Diffstat (limited to 'vp9/encoder')
-rw-r--r--  vp9/encoder/vp9_block.h        |   6
-rw-r--r--  vp9/encoder/vp9_context_tree.h |   1
-rw-r--r--  vp9/encoder/vp9_dct.c          |  44
-rw-r--r--  vp9/encoder/vp9_encodeframe.c  |  70
-rw-r--r--  vp9/encoder/vp9_encodemb.c     |  61
-rw-r--r--  vp9/encoder/vp9_encodemb.h     |   3
-rw-r--r--  vp9/encoder/vp9_encoder.c      |   4
-rw-r--r--  vp9/encoder/vp9_encoder.h      |   4
-rw-r--r--  vp9/encoder/vp9_firstpass.c    | 129
-rw-r--r--  vp9/encoder/vp9_firstpass.h    |   1
-rw-r--r--  vp9/encoder/vp9_mcomp.c        |  46
-rw-r--r--  vp9/encoder/vp9_mcomp.h        |   8
-rw-r--r--  vp9/encoder/vp9_pickmode.c     |  25
-rw-r--r--  vp9/encoder/vp9_quantize.c     |  50
-rw-r--r--  vp9/encoder/vp9_quantize.h     |  13
-rw-r--r--  vp9/encoder/vp9_ratectrl.c     |   5
-rw-r--r--  vp9/encoder/vp9_rdopt.c        |  81
-rw-r--r--  vp9/encoder/vp9_rdopt.h        |  46
-rw-r--r--  vp9/encoder/x86/vp9_dct_sse2.c | 208
19 files changed, 633 insertions, 172 deletions
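
The bulk of this change threads a per-block skip_txfm flag from the non-RD mode search (vp9_pickmode.c) through the partition search (vp9_encodeframe.c) into the block encoder (vp9_encodemb.c): when the prediction error of a block is small relative to the quantizer, the forward transform is either skipped outright or replaced by a DC-only fast path (vp9_xform_quant_fp() using the new vp9_fdct*_1 and vp9_quantize_dc helpers). A minimal standalone sketch of the decision rule added in model_rd_for_sb_y() in the diff below; the enum and function names here are illustrative only, not part of the patch:

// Sketch of the x->skip_txfm decision; the values mirror the 0/1/2
// stored in the new MACROBLOCK::skip_txfm field.
enum { TXFM_FULL = 0, TXFM_SKIP = 1, TXFM_DC_ONLY = 2 };

// sse/var come from the block variance function; dc_dequant and ac_dequant
// play the role of pd->dequant[0] and pd->dequant[1].
static int choose_skip_txfm(unsigned int sse, unsigned int var,
                            int dc_dequant, int ac_dequant) {
  if (sse < (unsigned int)((dc_dequant * dc_dequant) >> 6))
    return TXFM_SKIP;      // residual is expected to quantize to all zeros
  if (var < (unsigned int)((ac_dequant * ac_dequant) >> 6))
    return TXFM_DC_ONLY;   // keep only DC: vp9_xform_quant_fp() fast path
  return TXFM_FULL;        // regular vp9_xform_quant()
}

In encode_block(), skip_txfm == 1 simply zeroes the eob and the entropy contexts, skip_txfm == 2 takes the vp9_xform_quant_fp() DC-only path, and 0 keeps the full vp9_xform_quant() transform and quantization.
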
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index c3cd93b78..2463ed0f4 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -28,6 +28,7 @@ struct macroblock_plane { struct buf_2d src; // Quantizer setings + int16_t *quant_fp; int16_t *quant; int16_t *quant_shift; int16_t *zbin; @@ -48,7 +49,7 @@ struct macroblock { MACROBLOCKD e_mbd; int skip_block; - int select_txfm_size; + int select_tx_size; int skip_recode; int skip_optimize; int q_index; @@ -105,6 +106,9 @@ struct macroblock { int use_lp32x32fdct; int skip_encode; + // skip forward transform and quantization + int skip_txfm; + // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index bb384aacd..872223b75 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -33,6 +33,7 @@ typedef struct { int is_coded; int num_4x4_blk; int skip; + int skip_txfm; int best_mode_index; int hybrid_pred_diff; int comp_pred_diff; diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 577276764..5c99a0a78 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -43,6 +43,17 @@ static void fdct4(const int16_t *input, int16_t *output) { output[3] = fdct_round_shift(temp2); } +void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) + sum += input[r * stride + c]; + + output[0] = sum << 3; + output[1] = 0; +} + void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -240,6 +251,17 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = fdct_round_shift(t3); } +void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) + sum += input[r * stride + c]; + + output[0] = sum * 8; + output[1] = 0; +} + void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -311,6 +333,17 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { } } +void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) + sum += input[r * stride + c]; + + output[0] = sum * 8; + output[1] = 0; +} + void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. 
In the first one, we transform the columns and transpose @@ -1329,6 +1362,17 @@ static void fdct32(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } +void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) + sum += input[r * stride + c]; + + output[0] = sum << 2; + output[1] = 0; +} + void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 87e0c08d2..001ac69bd 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -697,6 +697,38 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, x->e_mbd.plane[i].subsampling_y); } +static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate, + int64_t *dist, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + INTERP_FILTER filter_ref; + + if (xd->up_available) + filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + else if (xd->left_available) + filter_ref = xd->mi[-1]->mbmi.interp_filter; + else + filter_ref = EIGHTTAP; + + mbmi->sb_type = bsize; + mbmi->mode = ZEROMV; + mbmi->tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[tx_mode]); + mbmi->skip = 1; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + mbmi->interp_filter = filter_ref; + + xd->mi[0]->bmi[0].as_mv[0].as_int = 0; + x->skip = 1; + x->skip_encode = 1; + + *rate = 0; + *dist = 0; +} + static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, int mi_col, int *totalrate, int64_t *totaldist, @@ -1338,6 +1370,7 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; + x->skip_txfm = mbmi->segment_id ? 
0 : ctx->skip_txfm; } static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, @@ -2364,15 +2397,15 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { } } -static void reset_skip_txfm_size(VP9_COMMON *cm, TX_SIZE txfm_max) { +static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) { int mi_row, mi_col; const int mis = cm->mi_stride; MODE_INFO **mi_ptr = cm->mi_grid_visible; for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if (mi_ptr[mi_col]->mbmi.tx_size > txfm_max) - mi_ptr[mi_col]->mbmi.tx_size = txfm_max; + if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size) + mi_ptr[mi_col]->mbmi.tx_size = max_tx_size; } } } @@ -2441,17 +2474,21 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; set_offsets(cpi, tile, mi_row, mi_col, bsize); - xd->mi[0]->mbmi.sb_type = bsize; + mbmi = &xd->mi[0]->mbmi; + mbmi->sb_type = bsize; if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - if (xd->mi[0]->mbmi.segment_id && x->in_static_area) + if (mbmi->segment_id && x->in_static_area) x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); } if (!frame_is_intra_only(cm)) { - vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, - rate, dist, bsize); + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize); + else + vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize); } else { set_mode_info(&xd->mi[0]->mbmi, bsize, DC_PRED); } @@ -2577,6 +2614,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize); ctx->mic.mbmi = xd->mi[0]->mbmi; + ctx->skip_txfm = x->skip_txfm; if (this_rate != INT_MAX) { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2663,6 +2701,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, &this_rate, &this_dist, subsize); pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].skip_txfm = x->skip_txfm; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -2672,6 +2711,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, &this_rate, &this_dist, subsize); pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].skip_txfm = x->skip_txfm; if (this_rate == INT_MAX) { sum_rd = INT64_MAX; @@ -2701,12 +2741,14 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, subsize); pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[0].skip_txfm = x->skip_txfm; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, &this_dist, subsize); pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[1].skip_txfm = x->skip_txfm; if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -2795,14 +2837,17 @@ static void nonrd_use_partition(VP9_COMP *cpi, case PARTITION_NONE: nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize); pc_tree->none.mic.mbmi = xd->mi[0]->mbmi; + pc_tree->none.skip_txfm = x->skip_txfm; break; case PARTITION_VERT: nonrd_pick_sb_modes(cpi, tile, mi_row, 
mi_col, totrate, totdist, subsize); pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[0].skip_txfm = x->skip_txfm; if (mi_col + hbs < cm->mi_cols) { nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs, &rate, &dist, subsize); pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[1].skip_txfm = x->skip_txfm; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; @@ -2813,10 +2858,12 @@ static void nonrd_use_partition(VP9_COMP *cpi, case PARTITION_HORZ: nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize); pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].skip_txfm = x->skip_txfm; if (mi_row + hbs < cm->mi_rows) { nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, &rate, &dist, subsize); pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].skip_txfm = x->skip_txfm; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { *totrate += rate; @@ -3019,6 +3066,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { init_encode_frame_mb_context(cpi); set_prev_mi(cm); + x->skip_txfm = 0; if (sf->use_nonrd_pick_mode) { // Initialize internal buffer pointers for rtc coding, where non-RD // mode decision is used and hence no buffer pointer swap needed. @@ -3207,16 +3255,16 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && count32x32 == 0) { cm->tx_mode = ALLOW_8X8; - reset_skip_txfm_size(cm, TX_8X8); + reset_skip_tx_size(cm, TX_8X8); } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { cm->tx_mode = ONLY_4X4; - reset_skip_txfm_size(cm, TX_4X4); + reset_skip_tx_size(cm, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { cm->tx_mode = ALLOW_32X32; } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { cm->tx_mode = ALLOW_16X16; - reset_skip_txfm_size(cm, TX_16X16); + reset_skip_tx_size(cm, TX_16X16); } } } else { @@ -3277,7 +3325,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 && + x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 && cpi->oxcf.aq_mode != COMPLEXITY_AQ && cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ && cpi->sf.allow_skip_recode; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 8581e6117..1c00698b5 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -301,6 +301,52 @@ static INLINE void fdct32x32(int rd_transform, vp9_fdct32x32(src, dst, src_stride); } +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + int i, j; + const int16_t *src_diff; + + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; + 
+ switch (tx_size) { + case TX_32X32: + vp9_fdct32x32_1(src_diff, coeff, diff_stride); + vp9_quantize_dc_32x32(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_16X16: + vp9_fdct16x16_1(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_8X8: + vp9_fdct8x8_1(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + default: + assert(0); + } +} + void vp9_xform_quant(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; @@ -376,8 +422,19 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, return; } - if (!x->skip_recode) - vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + if (x->skip_txfm == 0) { + // full forward transform and quantization + if (!x->skip_recode) + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } else if (x->skip_txfm == 2) { + // fast path forward transform and quantization + vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; + return; + } if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { const int ctx = combine_entropy_contexts(*a, *l); diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 802145984..3196c9920 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -22,7 +22,8 @@ extern "C" { void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); - +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); void vp9_xform_quant(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 6c0e0b58d..aa7a91dc1 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -588,8 +588,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->oxcf = *oxcf; cpi->pass = get_pass(cpi->oxcf.mode); - if (cpi->oxcf.mode == REALTIME) - cpi->oxcf.play_alternate = 0; rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; @@ -2416,7 +2414,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->refresh_alt_ref_frame = 0; // Should we code an alternate reference frame. 
- if (cpi->oxcf.play_alternate && rc->source_alt_ref_pending) { + if (is_altref_enabled(&cpi->oxcf) && rc->source_alt_ref_pending) { int frames_to_arf; #if CONFIG_MULTIPLE_ARF diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7e8206119..6b0e228ca 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -284,6 +284,10 @@ typedef struct VP9EncoderConfig { vp8e_tuning tuning; } VP9EncoderConfig; +static INLINE int is_altref_enabled(const VP9EncoderConfig *cfg) { + return cfg->mode != REALTIME && cfg->play_alternate && cfg->lag_in_frames > 0; +} + static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index a49fe3df5..430f71394 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -137,14 +137,13 @@ static void output_stats(FIRSTPASS_STATS *stats, FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); - fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f" + fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f" "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f" "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n", stats->frame, stats->intra_error, stats->coded_error, stats->sr_coded_error, - stats->ssim_weighted_pred_err, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, @@ -597,73 +596,91 @@ void vp9_first_pass(VP9_COMP *cpi) { if (cm->current_video_frame > 0) { int tmp_err, motion_error; int_mv mv, tmp_mv; + int raw_motion_error; + struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); - // Assume 0,0 motion with no mv overhead. - mv.as_int = tmp_mv.as_int = 0; - - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, - &motion_error); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); - motion_error = (int)(motion_error * error_weight); - } - // If the current best reference mv is not centered on 0,0 then do a 0,0 - // based search as well. - if (best_ref_mv.as_int) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, - &tmp_err); + // compute the motion error of the zero motion vector using the last + // source frame as the reference + // skip the further motion search on reconstructed frame + // if this error is small + unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer + + recon_yoffset; + unscaled_last_source_buf_2d.stride = + cpi->unscaled_last_source->y_stride; + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25) { + // Assume 0,0 motion with no mv overhead. + mv.as_int = tmp_mv.as_int = 0; + + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. 
+ first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, + &motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { vp9_clear_system_state(); - tmp_err = (int)(tmp_err * error_weight); + motion_error = (int)(motion_error * error_weight); } - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv.as_int = tmp_mv.as_int; + // If the current best reference mv is not centered on 0,0 + // then do a 0,0 + // based search as well. + if (best_ref_mv.as_int) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, + &tmp_err); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_clear_system_state(); + tmp_err = (int)(tmp_err * error_weight); + } + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv.as_int = tmp_mv.as_int; + } } - } - // Search in an older reference frame. - if (cm->current_video_frame > 1 && gld_yv12 != NULL) { - // Assume 0,0 motion with no mv overhead. - int gf_motion_error; + // Search in an older reference frame. + if (cm->current_video_frame > 1 && gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. + int gf_motion_error; - xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, - &gf_motion_error); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); - gf_motion_error = (int)(gf_motion_error * error_weight); - } + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, + &gf_motion_error); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_clear_system_state(); + gf_motion_error = (int)(gf_motion_error * error_weight); + } - if (gf_motion_error < motion_error && gf_motion_error < this_error) - ++second_ref_count; - - // Reset to last frame as reference buffer. - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame take the - // best of the motion predicted score and the intra coded error - // (just as will be done for) accumulation of "coded_error" for - // the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; - } else { - sr_coded_error += motion_error; + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++second_ref_count; + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + sr_coded_error += gf_motion_error; + else + sr_coded_error += this_error; + } else { + sr_coded_error += motion_error; + } } // Start by assuming that intra mode is best. 
best_ref_mv.as_int = 0; @@ -1508,7 +1525,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames; + unsigned int allow_alt_ref = is_altref_enabled(oxcf); int f_boost = 0; int b_boost = 0; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 309638c1e..82065213e 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -22,7 +22,6 @@ typedef struct { double intra_error; double coded_error; double sr_coded_error; - double ssim_weighted_pred_err; double pcnt_inter; double pcnt_motion; double pcnt_second_ref; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index dbd19a2d6..48ac5f929 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1593,3 +1593,49 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, } return best_sad; } + +int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd) { + const SPEED_FEATURES *const sf = &cpi->sf; + const SEARCH_METHODS method = sf->search_method; + vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + int var = 0; + + switch (method) { + case FAST_DIAMOND: + var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case FAST_HEX: + var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case HEX: + var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case SQUARE: + var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case BIGDIA: + var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, + fn_ptr, 1, ref_mv, tmp_mv); + break; + case NSTEP: + var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, + (sf->max_step_search_steps - 1) - step_param, + 1, fn_ptr, ref_mv, tmp_mv); + break; + default: + assert(!"Invalid search method."); + } + + if (method != NSTEP && rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); + + return var; +} diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 873edf376..07e410d3c 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -145,6 +145,14 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv, const uint8_t *second_pred); + +struct VP9_COMP; + +int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index d76d601ff..4b0b85ad4 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -84,8 +84,8 @@ static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv, - &tmp_mv->as_mv, INT_MAX, 0); + vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv, + &tmp_mv->as_mv, INT_MAX, 0); x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -156,24 +156,28 @@ static void 
model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, unsigned int sse; int rate; int64_t dist; - struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - + const int quant = pd->dequant[1]; unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); - *var_y = var; *sse_y = sse; + if (sse < pd->dequant[0] * pd->dequant[0] >> 6) + x->skip_txfm = 1; + else if (var < quant * quant >> 6) + x->skip_txfm = 2; + else + x->skip_txfm = 0; + // TODO(jingning) This is a temporary solution to account for frames with // light changes. Need to customize the rate-distortion modeling for non-RD // mode decision. if ((sse >> 3) > var) sse = var; - vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize], - pd->dequant[1] >> 3, &rate, &dist); + quant >> 3, &rate, &dist); *out_rate_sum = rate; *out_dist_sum = dist << 3; } @@ -199,6 +203,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; int64_t this_rd = INT64_MAX; + int skip_txfm = 0; int rate = INT_MAX; int64_t dist = INT64_MAX; @@ -341,6 +346,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cost < best_cost) { best_filter = filter; best_cost = cost; + skip_txfm = x->skip_txfm; } } @@ -349,6 +355,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, dist = pf_dist[mbmi->interp_filter]; var_y = pf_var[mbmi->interp_filter]; sse_y = pf_sse[mbmi->interp_filter]; + x->skip_txfm = skip_txfm; } else { mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); @@ -438,6 +445,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, best_mode = this_mode; best_pred_filter = mbmi->interp_filter; best_ref_frame = ref_frame; + skip_txfm = x->skip_txfm; } if (x->skip) @@ -450,6 +458,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = best_ref_frame; mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; + x->skip_txfm = skip_txfm; // Perform intra prediction search, if the best SAD is above a certain // threshold. 
@@ -474,6 +483,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = INTRA_FRAME; mbmi->uv_mode = this_mode; mbmi->mv[0].as_int = INVALID_MV; + } else { + x->skip_txfm = skip_txfm; } } } diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 4d3086d60..f817bcc31 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -19,6 +19,50 @@ #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rdopt.h" +void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + int eob = -1; + + if (!skip_block) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + int eob = -1; + + if (!skip_block) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} + void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -167,6 +211,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quant = i == 0 ? 
vp9_dc_quant(q, cm->y_dc_delta_q) : vp9_ac_quant(q, 0); invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); + quants->y_quant_fp[q][i] = (1 << 16) / quant; quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->y_round[q][i] = (qrounding_factor * quant) >> 7; cm->y_dequant[q][i] = quant; @@ -176,6 +221,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) { : vp9_ac_quant(q, cm->uv_ac_delta_q); invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], quant); + quants->uv_quant_fp[q][i] = (1 << 16) / quant; quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; cm->uv_dequant[q][i] = quant; @@ -193,12 +239,14 @@ void vp9_init_quantizer(VP9_COMP *cpi) { for (i = 2; i < 8; i++) { quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; quants->y_zbin[q][i] = quants->y_zbin[q][1]; quants->y_round[q][i] = quants->y_round[q][1]; cm->y_dequant[q][i] = cm->y_dequant[q][1]; quants->uv_quant[q][i] = quants->uv_quant[q][1]; + quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; quants->uv_round[q][i] = quants->uv_round[q][1]; @@ -227,6 +275,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // Y x->plane[0].quant = quants->y_quant[qindex]; + x->plane[0].quant_fp = quants->y_quant_fp[qindex]; x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; @@ -236,6 +285,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // UV for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; + x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 1835e9ccc..0e90462eb 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -24,6 +24,11 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. 
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); @@ -37,6 +42,14 @@ typedef struct { #endif } QUANTS; +void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index e8e425661..143c23ba9 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1129,7 +1129,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; - if (oxcf->play_alternate && cpi->refresh_alt_ref_frame && + if (is_altref_enabled(oxcf) && cpi->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME)) // Update the alternate reference frame stats as appropriate. update_alt_ref_frame_stats(cpi); @@ -1389,8 +1389,7 @@ void vp9_rc_set_gf_max_interval(const VP9EncoderConfig *const oxcf, // Extended interval for genuinely static scenes rc->static_scene_max_gf_interval = oxcf->key_freq >> 1; - // Special conditions when alt ref frame enabled - if (oxcf->play_alternate && oxcf->lag_in_frames) { + if (is_altref_enabled(oxcf)) { if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 8e77dd05b..e110df2c4 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -311,8 +311,8 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO; x->errorperbit += (x->errorperbit == 0); - x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && - cm->frame_type != KEY_FRAME) ? 0 : 1; + x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cm->frame_type != KEY_FRAME) ? 
0 : 1; set_block_thresholds(cm, rd); @@ -796,11 +796,11 @@ static void txfm_rd_in_plane(MACROBLOCK *x, } } -static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skip, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -815,12 +815,12 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, cpi->tx_stepdown_count[0]++; } -static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, - int64_t tx_cache[TX_MODES], - BLOCK_SIZE bs) { +static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, + int (*r)[2], int *rate, + int64_t *d, int64_t *distortion, + int *s, int *skip, + int64_t tx_cache[TX_MODES], + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -898,12 +898,12 @@ static int64_t scaled_rd_cost(int rdmult, int rddiv, return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale); } -static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static void choose_tx_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, + int (*r)[2], int *rate, + int64_t *d, int64_t *distortion, + int *s, int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -987,8 +987,8 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, - ref_best_rd, bs); + choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, + bs); if (psse) *psse = sse[mbmi->tx_size]; return; @@ -998,8 +998,8 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size], &sse[tx_size], ref_best_rd, 0, bs, tx_size, cpi->sf.use_fast_coef_costing); - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, bs); + choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, + skip, txfm_cache, bs); if (psse) *psse = sse[mbmi->tx_size]; @@ -1017,8 +1017,8 @@ static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, assert(bs == mbmi->sb_type); if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, - ref_best_rd, bs); + choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, + bs); } else { int r[TX_SIZES][2], s[TX_SIZES]; int64_t d[TX_SIZES]; @@ -1028,8 +1028,8 @@ static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, &s[tx_size], &sse[tx_size], ref_best_rd, 0, bs, tx_size, cpi->sf.use_fast_coef_costing); - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, 
bs); + choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, + bs); } if (psse) *psse = sse[mbmi->tx_size]; @@ -1330,7 +1330,7 @@ static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); + const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); int plane; int pnrate = 0, pnskip = 1; int64_t pndist = 0, pnsse = 0; @@ -1351,7 +1351,7 @@ static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, - ref_best_rd, plane, bsize, uv_txfm_size, + ref_best_rd, plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing); if (pnrate == INT_MAX) goto term; @@ -1403,7 +1403,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; - if (!x->select_txfm_size) + if (!x->select_tx_size) swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE); } } @@ -1850,9 +1850,9 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv); - bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - sadpb, &bsi->ref_mv[0]->as_mv, new_mv, - INT_MAX, 1); + bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, + sadpb, &bsi->ref_mv[0]->as_mv, new_mv, + INT_MAX, 1); // Should we do a full search (best quality only) if (is_best_mode(cpi->oxcf.mode)) { @@ -2385,8 +2385,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); + bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -2802,7 +2802,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += vp9_get_switchable_rate(cpi); if (!is_comp_pred) { - if (!x->in_active_map) { + if (!x->in_active_map || + vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { if (psse) *psse = 0; *distortion = 0; @@ -3119,9 +3120,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. 
if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { - const int inter_non_zero_mode_mask = 0x1F7F7; - mode_skip_mask |= inter_non_zero_mode_mask; - mode_skip_mask &= ~(1 << THR_ZEROMV); + mode_skip_mask = ~(1 << THR_ZEROMV); inter_mode_mask = (1 << ZEROMV); } @@ -3442,7 +3441,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; - if (!x->select_txfm_size) + if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); @@ -4072,7 +4071,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; - if (!x->select_txfm_size) + if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], sizeof(uint8_t) * ctx->num_4x4_blk); diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index e85d08a6d..6e5631795 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -94,52 +94,6 @@ static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; } -static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, MV *mvp_full, - int step_param, int error_per_bit, - const MV *ref_mv, MV *tmp_mv, - int var_max, int rd) { - const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->search_method; - vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; - int var = 0; - - switch (method) { - case FAST_DIAMOND: - var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, - fn_ptr, 1, ref_mv, tmp_mv); - break; - case FAST_HEX: - var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, - fn_ptr, 1, ref_mv, tmp_mv); - break; - case HEX: - var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, - fn_ptr, 1, ref_mv, tmp_mv); - break; - case SQUARE: - var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, - fn_ptr, 1, ref_mv, tmp_mv); - break; - case BIGDIA: - var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, - fn_ptr, 1, ref_mv, tmp_mv); - break; - case NSTEP: - var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - (sf->max_step_search_steps - 1) - step_param, - 1, fn_ptr, ref_mv, tmp_mv); - break; - default: - assert(!"Invalid search method."); - } - - if (method != NSTEP && rd && var < var_max) - var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); - - return var; -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index 1f58d872e..487deef42 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -12,6 +12,35 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" +void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) + (input + 2 * stride))); + in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) + (input + 3 * stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = 
_mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in0); +} + void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given @@ -377,6 +406,46 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + _mm_store_si128((__m128i *)(output), in1); +} + void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { int pass; // Constants @@ -1168,6 +1237,74 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + input += 8 * i; + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const 
__m128i *)(input + 12 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in1); +} + void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -2680,6 +2817,77 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, } } +void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + _mm_store_si128((__m128i *)(output), in1); +} + #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" |
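
For reference, the DC-only fast path above quantizes just coefficient 0 against the fixed-point reciprocal quant_fp = (1 << 16) / dequant that vp9_init_quantizer() now stores per plane. A small self-contained model of vp9_quantize_dc() follows; the helper and parameter names are illustrative only, and the 32x32 variant differs just in shifting by 15 and halving the dequantized value:

#include <stdint.h>

static int clamp_val(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

// Standalone model of vp9_quantize_dc(): only coefficient 0 is quantized,
// every other coefficient is implicitly zero. 'quant' is the fixed-point
// reciprocal ((1 << 16) / dequant) stored in y_quant_fp/uv_quant_fp;
// 'round' is the DC rounding offset from p->round.
static void quantize_dc_model(const int16_t *coeff, int16_t round,
                              int16_t quant, int16_t dequant,
                              int16_t *qcoeff, int16_t *dqcoeff,
                              uint16_t *eob) {
  const int c = coeff[0];
  const int sign = c >> 31;                 // 0 for positive, -1 for negative
  const int abs_c = (c ^ sign) - sign;
  int tmp = clamp_val(abs_c + round, INT16_MIN, INT16_MAX);
  tmp = (tmp * quant) >> 16;
  qcoeff[0] = (int16_t)((tmp ^ sign) - sign);
  dqcoeff[0] = (int16_t)(qcoeff[0] * dequant);
  *eob = tmp ? 1 : 0;                       // eob is one past the last nonzero
}

Because only the DC term survives this path, the matching vp9_fdct*_1 transforms above only need to accumulate the block sum and scale it, which is exactly what the new SSE2 versions vectorize.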