diff options
Diffstat (limited to 'vp9')
37 files changed, 1285 insertions, 1166 deletions
diff --git a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm index 6b20cb9bf..4d85846f0 100644 --- a/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm +++ b/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -78,7 +78,7 @@ mov r10, r6 ; w loop counter -loop_horiz_v +vp9_convolve8_avg_loop_horiz_v vld1.8 {d24}, [r0], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d26}, [r0], r1 @@ -101,7 +101,7 @@ loop_horiz_v add r0, r0, #3 -loop_horiz +vp9_convolve8_avg_loop_horiz add r5, r0, #64 vld1.32 {d28[]}, [r0], r1 @@ -170,14 +170,14 @@ loop_horiz vmov q9, q13 subs r6, r6, #4 ; w -= 4 - bgt loop_horiz + bgt vp9_convolve8_avg_loop_horiz ; outer loop mov r6, r10 ; restore w counter add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt loop_horiz_v + bgt vp9_convolve8_avg_loop_horiz_v pop {r4-r10, pc} @@ -203,7 +203,7 @@ loop_horiz lsl r1, r1, #1 lsl r3, r3, #1 -loop_vert_h +vp9_convolve8_avg_loop_vert_h mov r4, r0 add r7, r0, r1, asr #1 mov r5, r2 @@ -223,7 +223,7 @@ loop_vert_h vmovl.u8 q10, d20 vmovl.u8 q11, d22 -loop_vert +vp9_convolve8_avg_loop_vert ; always process a 4x4 block at a time vld1.u32 {d24[0]}, [r7], r1 vld1.u32 {d26[0]}, [r4], r1 @@ -288,13 +288,13 @@ loop_vert vmov d22, d25 subs r12, r12, #4 ; h -= 4 - bgt loop_vert + bgt vp9_convolve8_avg_loop_vert ; outer loop add r0, r0, #4 add r2, r2, #4 subs r6, r6, #4 ; w -= 4 - bgt loop_vert_h + bgt vp9_convolve8_avg_loop_vert_h pop {r4-r8, pc} diff --git a/vp9/common/arm/neon/vp9_convolve8_neon.asm b/vp9/common/arm/neon/vp9_convolve8_neon.asm index 45258454c..184c3ad67 100644 --- a/vp9/common/arm/neon/vp9_convolve8_neon.asm +++ b/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -78,7 +78,7 @@ mov r10, r6 ; w loop counter -loop_horiz_v +vp9_convolve8_loop_horiz_v vld1.8 {d24}, [r0], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d26}, [r0], r1 @@ -101,7 +101,7 @@ loop_horiz_v add r0, r0, #3 -loop_horiz +vp9_convolve8_loop_horiz add r5, r0, #64 vld1.32 {d28[]}, [r0], r1 @@ -159,14 +159,14 @@ loop_horiz vmov q9, q13 subs r6, r6, #4 ; w -= 4 - bgt loop_horiz + bgt vp9_convolve8_loop_horiz ; outer loop mov r6, r10 ; restore w counter add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt loop_horiz_v + bgt vp9_convolve8_loop_horiz_v pop {r4-r10, pc} @@ -192,7 +192,7 @@ loop_horiz lsl r1, r1, #1 lsl r3, r3, #1 -loop_vert_h +vp9_convolve8_loop_vert_h mov r4, r0 add r7, r0, r1, asr #1 mov r5, r2 @@ -212,7 +212,7 @@ loop_vert_h vmovl.u8 q10, d20 vmovl.u8 q11, d22 -loop_vert +vp9_convolve8_loop_vert ; always process a 4x4 block at a time vld1.u32 {d24[0]}, [r7], r1 vld1.u32 {d26[0]}, [r4], r1 @@ -266,13 +266,13 @@ loop_vert vmov d22, d25 subs r12, r12, #4 ; h -= 4 - bgt loop_vert + bgt vp9_convolve8_loop_vert ; outer loop add r0, r0, #4 add r2, r2, #4 subs r6, r6, #4 ; w -= 4 - bgt loop_vert_h + bgt vp9_convolve8_loop_vert_h pop {r4-r8, pc} diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index c3fdeb48a..77a8709f0 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -748,10 +748,10 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { } void vp9_default_coef_probs(VP9_COMMON *cm) { - vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4); - vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8); - vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16); - vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); + vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4); + vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32); } #define COEF_COUNT_SAT 24 @@ -765,7 +765,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, unsigned int count_sat, unsigned int update_factor) { const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size]; + vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size]; const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size]; vp9_coeff_count_model *counts = cm->counts.coef[tx_size]; unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 5b00b0082..1a24572ba 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -350,7 +350,7 @@ static void adapt_probs(const vp9_tree_index *tree, void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; - FRAME_CONTEXT *fc = &cm->fc; + FRAME_CONTEXT *fc = cm->fc; const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; const FRAME_COUNTS *counts = &cm->counts; @@ -451,17 +451,17 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); - vp9_init_mode_probs(&cm->fc); + vp9_init_mode_probs(cm->fc); vp9_init_mv_probs(cm); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->reset_frame_context == 3) { // Reset all frame contexts. for (i = 0; i < FRAME_CONTEXTS; ++i) - cm->frame_contexts[i] = cm->fc; + cm->frame_contexts[i] = *cm->fc; } else if (cm->reset_frame_context == 2) { // Reset only the frame context specified in the frame header. - cm->frame_contexts[cm->frame_context_idx] = cm->fc; + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; } if (frame_is_intra_only(cm)) diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 5bb048202..922c03947 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -196,7 +196,7 @@ static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; - nmv_context *fc = &cm->fc.nmvc; + nmv_context *fc = &cm->fc->nmvc; const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; const nmv_context_counts *counts = &cm->counts.mv; @@ -229,5 +229,5 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { } void vp9_init_mv_probs(VP9_COMMON *cm) { - cm->fc.nmvc = default_nmv_context; + cm->fc->nmvc = default_nmv_context; } diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index aca8d7b33..43a4fe5b9 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1625,6 +1625,17 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, y_only); } +void vp9_loop_filter_data_reset( + LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) { + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->start = 0; + lf_data->stop = 0; + lf_data->y_only = 0; + vpx_memcpy(lf_data->planes, planes, sizeof(lf_data->planes)); +} + int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { (void)unused; vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 0ede58ae4..4c15e6bd4 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -124,11 +124,12 @@ typedef struct LoopFilterWorkerData { int start; int stop; int y_only; - - struct VP9LfSyncData *lf_sync; - int num_lf_workers; } LFWorkerData; +void vp9_loop_filter_data_reset( + LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); + // Operates on the rows described by 'lf_data'. int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); #ifdef __cplusplus diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index f1eda9117..b818ae818 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -169,8 +169,8 @@ typedef struct VP9Common { MV_REFERENCE_FRAME comp_var_ref[2]; REFERENCE_MODE reference_mode; - FRAME_CONTEXT fc; /* this frame entropy */ - FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS]; + FRAME_CONTEXT *fc; /* this frame entropy */ + FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS unsigned int frame_context_idx; /* Context to use/update */ FRAME_COUNTS counts; @@ -261,7 +261,7 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm, int ctx) { return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx] - : cm->fc.partition_prob[ctx]; + : cm->fc->partition_prob[ctx]; } static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 39774f142..cf13e4a91 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -54,7 +54,7 @@ static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - return cm->fc.skip_probs[vp9_get_skip_context(xd)]; + return cm->fc->skip_probs[vp9_get_skip_context(xd)]; } int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); @@ -63,14 +63,14 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)]; + return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)]; } int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; + return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; } int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, @@ -79,21 +79,21 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, const MACROBLOCKD *xd) { const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd); - return cm->fc.comp_ref_prob[pred_context]; + return cm->fc->comp_ref_prob[pred_context]; } int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; + return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; } int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; + return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } int vp9_get_tx_size_context(const MACROBLOCKD *xd); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index dc712f045..66da63ac6 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -127,7 +127,7 @@ static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, } static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) { - FRAME_CONTEXT *const fc = &cm->fc; + FRAME_CONTEXT *const fc = cm->fc; int i; if (cm->reference_mode == REFERENCE_MODE_SELECT) @@ -902,11 +902,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; // Be sure to sync as we might be resuming after a failed frame decode. winterface->sync(&pbi->lf_worker); - lf_data->frame_buffer = get_frame_new_buffer(cm); - lf_data->cm = cm; - vp9_copy(lf_data->planes, pbi->mb.plane); - lf_data->stop = 0; - lf_data->y_only = 0; + vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm, + pbi->mb.plane); vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } @@ -1065,14 +1062,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, // use num_threads - 1 workers. CHECK_MEM_ERROR(cm, pbi->tile_workers, vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); + // Ensure tile data offsets will be properly aligned. This may fail on + // platforms without DECLARE_ALIGNED(). + assert((sizeof(*pbi->tile_worker_data) % 16) == 0); + CHECK_MEM_ERROR(cm, pbi->tile_worker_data, + vpx_memalign(32, num_threads * + sizeof(*pbi->tile_worker_data))); + CHECK_MEM_ERROR(cm, pbi->tile_worker_info, + vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info))); for (i = 0; i < num_threads; ++i) { VP9Worker *const worker = &pbi->tile_workers[i]; ++pbi->num_tile_workers; winterface->init(worker); - CHECK_MEM_ERROR(cm, worker->data1, - vpx_memalign(32, sizeof(TileWorkerData))); - CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); if (i < num_threads - 1 && !winterface->reset(worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); @@ -1082,8 +1084,11 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { - winterface->sync(&pbi->tile_workers[n]); - pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook; + VP9Worker *const worker = &pbi->tile_workers[n]; + winterface->sync(worker); + worker->hook = (VP9WorkerHook)tile_worker_hook; + worker->data1 = &pbi->tile_worker_data[n]; + worker->data2 = &pbi->tile_worker_info[n]; } // Note: this memset assumes above_context[0], [1] and [2] @@ -1386,7 +1391,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, size_t partition_size) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - FRAME_CONTEXT *const fc = &cm->fc; + FRAME_CONTEXT *const fc = cm->fc; vp9_reader r; int k; @@ -1540,7 +1545,7 @@ void vp9_decode_frame(VP9Decoder *pbi, setup_plane_dequants(cm, xd, cm->base_qindex); vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); - cm->fc = cm->frame_contexts[cm->frame_context_idx]; + *cm->fc = cm->frame_contexts[cm->frame_context_idx]; vp9_zero(cm->counts); vp9_zero(xd->dqcoeff); @@ -1555,7 +1560,9 @@ void vp9_decode_frame(VP9Decoder *pbi, if (!xd->corrupted) { // If multiple threads are used to decode tiles, then we use those threads // to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); + vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm, + pbi->tile_workers, pbi->num_tile_workers, + cm->lf.filter_level, 0); } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); @@ -1580,5 +1587,5 @@ void vp9_decode_frame(VP9Decoder *pbi, } if (cm->refresh_frame_context) - cm->frame_contexts[cm->frame_context_idx] = cm->fc; + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; } diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index a01fe842e..d0e0b76da 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -30,7 +30,7 @@ static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, int size_group) { const PREDICTION_MODE y_mode = - read_intra_mode(r, cm->fc.y_mode_prob[size_group]); + read_intra_mode(r, cm->fc->y_mode_prob[size_group]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.y_mode[size_group][y_mode]; return y_mode; @@ -39,7 +39,7 @@ static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, PREDICTION_MODE y_mode) { const PREDICTION_MODE uv_mode = read_intra_mode(r, - cm->fc.uv_mode_prob[y_mode]); + cm->fc->uv_mode_prob[y_mode]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.uv_mode[y_mode][uv_mode]; return uv_mode; @@ -47,7 +47,7 @@ static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) { const int mode = vp9_read_tree(r, vp9_inter_mode_tree, - cm->fc.inter_mode_probs[ctx]); + cm->fc->inter_mode_probs[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.inter_mode[ctx][mode]; @@ -61,7 +61,7 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_SIZE max_tx_size, vp9_reader *r) { const int ctx = vp9_get_tx_size_context(xd); - const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs); int tx_size = vp9_read(r, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { tx_size += vp9_read(r, tx_probs[1]); @@ -150,7 +150,7 @@ static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, return 1; } else { const int ctx = vp9_get_skip_context(xd); - const int skip = vp9_read(r, cm->fc.skip_probs[ctx]); + const int skip = vp9_read(r, cm->fc->skip_probs[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.skip[ctx][skip]; return skip; @@ -258,7 +258,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, if (cm->reference_mode == REFERENCE_MODE_SELECT) { const int ctx = vp9_get_reference_mode_context(cm, xd); const REFERENCE_MODE mode = - (REFERENCE_MODE)vp9_read(r, cm->fc.comp_inter_prob[ctx]); + (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.comp_inter[ctx][mode]; return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE @@ -271,7 +271,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { - FRAME_CONTEXT *const fc = &cm->fc; + FRAME_CONTEXT *const fc = cm->fc; FRAME_COUNTS *const counts = &cm->counts; if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { @@ -317,7 +317,7 @@ static INLINE INTERP_FILTER read_switchable_interp_filter( const int ctx = vp9_get_pred_context_switchable_interp(xd); const INTERP_FILTER type = (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree, - cm->fc.switchable_interp_prob[ctx]); + cm->fc->switchable_interp_prob[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.switchable_interp[ctx][type]; return type; @@ -372,7 +372,7 @@ static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode, nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? NULL : &cm->counts.mv; for (i = 0; i < 1 + is_compound; ++i) { - read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts, + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts, allow_hp); ret = ret && is_mv_valid(&mv[i].as_mv); } @@ -410,7 +410,7 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, INTRA_FRAME; } else { const int ctx = vp9_get_intra_inter_context(xd); - const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]); + const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.intra_inter[ctx][is_inter]; return is_inter; diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index baf6ab7ef..fa2f01041 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -59,6 +59,13 @@ VP9Decoder *vp9_decoder_create() { } cm->error.setjmp = 1; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(cm, cm->frame_contexts, + (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, + sizeof(*cm->frame_contexts))); + pbi->need_resync = 1; initialize_dec(); @@ -88,15 +95,20 @@ void vp9_decoder_remove(VP9Decoder *pbi) { VP9_COMMON *const cm = &pbi->common; int i; + vpx_free(cm->fc); + cm->fc = NULL; + vpx_free(cm->frame_contexts); + cm->frame_contexts = NULL; + vp9_get_worker_interface()->end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); vpx_free(pbi->tile_data); for (i = 0; i < pbi->num_tile_workers; ++i) { VP9Worker *const worker = &pbi->tile_workers[i]; vp9_get_worker_interface()->end(worker); - vpx_free(worker->data1); - vpx_free(worker->data2); } + vpx_free(pbi->tile_worker_data); + vpx_free(pbi->tile_worker_info); vpx_free(pbi->tile_workers); if (pbi->num_tile_workers > 0) { diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 4f52bb9c4..25b7339ed 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -46,6 +46,8 @@ typedef struct VP9Decoder { VP9Worker lf_worker; VP9Worker *tile_workers; + TileWorkerData *tile_worker_data; + TileInfo *tile_worker_info; int num_tile_workers; TileData *tile_data; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 421229a28..8704fddac 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -58,7 +58,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, int ctx, const int16_t *scan, const int16_t *nb, vp9_reader *r) { const int max_eob = 16 << (tx_size << 1); - const FRAME_CONTEXT *const fc = &cm->fc; + const FRAME_CONTEXT *const fc = cm->fc; FRAME_COUNTS *const counts = &cm->counts; const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi); int band, c = 0; diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index 69e4fde85..3d2d0dd2e 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -92,12 +92,12 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, VP9_COMMON *const cm, struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only, - VP9LfSync *const lf_sync, int num_lf_workers) { + VP9LfSync *const lf_sync) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; int r, c; // SB row and col const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; - for (r = start; r < stop; r += num_lf_workers) { + for (r = start; r < stop; r += lf_sync->num_workers) { const int mi_row = r << MI_BLOCK_SIZE_LOG2; MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride; @@ -121,35 +121,35 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(TileWorkerData *const tile_data, - void *unused) { - LFWorkerData *const lf_data = &tile_data->lfdata; - (void)unused; +static int loop_filter_row_worker(VP9LfSync *const lf_sync, + LFWorkerData *const lf_data) { loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, - lf_data->start, lf_data->stop, lf_data->y_only, - lf_data->lf_sync, lf_data->num_lf_workers); + lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); return 1; } // VP9 decoder: Implement multi-threaded loopfilter that uses the tile // threads. -void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, - VP9Decoder *pbi, VP9_COMMON *cm, +void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, + YV12_BUFFER_CONFIG *frame, + struct macroblockd_plane planes[MAX_MB_PLANE], + VP9_COMMON *cm, + VP9Worker *workers, int nworkers, int frame_filter_level, int y_only) { - VP9LfSync *const lf_sync = &pbi->lf_row_sync; const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; const int tile_cols = 1 << cm->log2_tile_cols; - const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); + const int num_workers = MIN(nworkers, tile_cols); int i; if (!frame_filter_level) return; - if (!lf_sync->sync_range || cm->last_height != cm->height) { + if (!lf_sync->sync_range || cm->last_height != cm->height || + num_workers > lf_sync->num_workers) { vp9_loop_filter_dealloc(lf_sync); - vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } vp9_loop_filter_frame_init(cm, frame_filter_level); @@ -158,32 +158,26 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); // Set up loopfilter thread data. - // The decoder is using num_workers instead of pbi->num_tile_workers - // because it has been observed that using more threads on the - // loopfilter, than there are tile columns in the frame will hurt - // performance on Android. This is because the system will only - // schedule the tile decode workers on cores equal to the number - // of tile columns. Then if the decoder tries to use more threads for the - // loopfilter, it will hurt performance because of contention. If the - // multithreading code changes in the future then the number of workers - // used by the loopfilter should be revisited. + // The decoder is capping num_workers because it has been observed that using + // more threads on the loopfilter than there are cores will hurt performance + // on Android. This is because the system will only schedule the tile decode + // workers on cores equal to the number of tile columns. Then if the decoder + // tries to use more threads for the loopfilter, it will hurt performance + // because of contention. If the multithreading code changes in the future + // then the number of workers used by the loopfilter should be revisited. for (i = 0; i < num_workers; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; - TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; - LFWorkerData *const lf_data = &tile_data->lfdata; + VP9Worker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; worker->hook = (VP9WorkerHook)loop_filter_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; // Loopfilter data - lf_data->frame_buffer = frame; - lf_data->cm = cm; - vp9_copy(lf_data->planes, pbi->mb.plane); + vp9_loop_filter_data_reset(lf_data, frame, cm, planes); lf_data->start = i; lf_data->stop = sb_rows; - lf_data->y_only = y_only; // always do all planes in decoder - - lf_data->lf_sync = lf_sync; - lf_data->num_lf_workers = num_workers; + lf_data->y_only = y_only; // Start loopfiltering if (i == num_workers - 1) { @@ -195,7 +189,7 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, // Wait till all rows are finished for (i = 0; i < num_workers; ++i) { - winterface->sync(&pbi->tile_workers[i]); + winterface->sync(&workers[i]); } } @@ -215,7 +209,7 @@ static int get_sync_range(int width) { // Allocate memory for lf row synchronization void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, - int width) { + int width, int num_workers) { lf_sync->rows = rows; #if CONFIG_MULTITHREAD { @@ -239,6 +233,10 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } #endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); + lf_sync->num_workers = num_workers; + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); @@ -265,6 +263,7 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { vpx_free(lf_sync->cond_); } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h index b1fbdeb74..d5810b45b 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/decoder/vp9_dthread.h @@ -22,9 +22,6 @@ typedef struct TileWorkerData { struct VP9Common *cm; vp9_reader bit_reader; DECLARE_ALIGNED(16, struct macroblockd, xd); - - // Row-based parallel loopfilter data - LFWorkerData lfdata; } TileWorkerData; // Loopfilter row synchronization @@ -39,19 +36,25 @@ typedef struct VP9LfSyncData { // determined by testing. Currently, it is chosen to be a power-of-2 number. int sync_range; int rows; + + // Row-based parallel loopfilter data + LFWorkerData *lfdata; + int num_workers; } VP9LfSync; // Allocate memory for loopfilter row synchronization. void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, - int width); + int width, int num_workers); // Deallocate loopfilter synchronization related mutex and data. void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); // Multi-threaded loopfilter that uses the tile threads. -void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, - struct VP9Decoder *pbi, +void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, + YV12_BUFFER_CONFIG *frame, + struct macroblockd_plane planes[MAX_MB_PLANE], struct VP9Common *cm, + VP9Worker *workers, int num_workers, int frame_filter_level, int y_only); diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 421e04969..7cfd14307 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -84,7 +84,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, vp9_writer *w) { const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, - &cm->fc.tx_probs); + &cm->fc->tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -108,14 +108,14 @@ static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) { int k; for (k = 0; k < SKIP_CONTEXTS; ++k) - vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]); + vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], cm->counts.skip[k]); } static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) { int j; for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) prob_diff_update(vp9_switchable_interp_tree, - cm->fc.switchable_interp_prob[j], + cm->fc->switchable_interp_prob[j], cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w); } @@ -237,7 +237,7 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; - const nmv_context *nmvc = &cm->fc.nmvc; + const nmv_context *nmvc = &cm->fc->nmvc; const MACROBLOCK *const x = &cpi->mb; const MACROBLOCKD *const xd = &x->e_mbd; const struct segmentation *const seg = &cm->seg; @@ -275,7 +275,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (!is_inter) { if (bsize >= BLOCK_8X8) { - write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]); + write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]); } else { int idx, idy; const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; @@ -283,14 +283,14 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; - write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]); + write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]); } } } - write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]); + write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]); } else { const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; - const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx]; + const vp9_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx]; write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. @@ -304,7 +304,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (cm->interp_filter == SWITCHABLE) { const int ctx = vp9_get_pred_context_switchable_interp(xd); vp9_write_token(w, vp9_switchable_interp_tree, - cm->fc.switchable_interp_prob[ctx], + cm->fc->switchable_interp_prob[ctx], &switchable_interp_encodings[mbmi->interp_filter]); ++cpi->interp_filter_selected[0][mbmi->interp_filter]; } else { @@ -528,7 +528,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, TX_SIZE tx_size, vp9_coeff_stats *frame_branch_ct, vp9_coeff_probs_model *new_coef_probs) { - vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size]; + vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size]; const vp9_prob upd = DIFF_UPDATE_PROB; const int entropy_nodes_update = UNCONSTRAINED_NODES; int i, j, k, l, t; @@ -830,20 +830,20 @@ static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) { for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p); for (j = 0; j < TX_SIZES - 3; j++) - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]); + vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p); for (j = 0; j < TX_SIZES - 2; j++) - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], + vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j], ct_16x16p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); for (j = 0; j < TX_SIZES - 1; j++) - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], + vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j], ct_32x32p[j]); } } @@ -929,13 +929,11 @@ static int get_refresh_mask(VP9_COMP *cpi) { static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { VP9_COMMON *const cm = &cpi->common; vp9_writer residual_bc; - int tile_row, tile_col; TOKENEXTRA *tok[4][1 << 6], *tok_end; size_t total_size = 0; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; - TileInfo tile[4][1 << 6]; TOKENEXTRA *pre_tok = cpi->tok; int tile_tok = 0; @@ -944,18 +942,16 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col); - + int tile_idx = tile_row * tile_cols + tile_col; tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = tok[tile_row][tile_col]; - tile_tok = allocated_tokens(tile[tile_row][tile_col]); + tile_tok = allocated_tokens(cpi->tile_data[tile_idx].tile_info); } } for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { - const TileInfo * const ptile = &tile[tile_row][tile_col]; - + int tile_idx = tile_row * tile_cols + tile_col; tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) @@ -963,7 +959,8 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { else vp9_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, ptile, &residual_bc, &tok[tile_row][tile_col], tok_end); + write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, + &residual_bc, &tok[tile_row][tile_col], tok_end); assert(tok[tile_row][tile_col] == tok_end); vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { @@ -1161,7 +1158,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - FRAME_CONTEXT *const fc = &cm->fc; + FRAME_CONTEXT *const fc = cm->fc; vp9_writer header_bc; vp9_start_encode(&header_bc, data); @@ -1178,7 +1175,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { int i; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i], + prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i], cm->counts.inter_mode[i], INTER_MODES, &header_bc); vp9_zero(cm->counts.inter_mode); @@ -1219,7 +1216,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { cm->counts.comp_ref[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) - prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i], + prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i], cm->counts.y_mode[i], INTRA_MODES, &header_bc); for (i = 0; i < PARTITION_CONTEXTS; ++i) diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index 6b28ee591..47d9580a8 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -34,6 +34,7 @@ typedef struct { int is_coded; int num_4x4_blk; int skip; + int pred_pixel_ready; // For current partition, only if all Y, U, and V transform blocks' // coefficients are quantized to 0, skippable is set to 0. int skippable; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f4e71aeb3..baa4908d4 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -467,7 +467,6 @@ static void choose_partitioning(VP9_COMP *cpi, int sp; int dp; int pixels_wide = 64, pixels_high = 64; - int_mv nearest_mv, near_mv; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; @@ -488,11 +487,7 @@ static void choose_partitioning(VP9_COMP *cpi, xd->mi[0].src_mi->mbmi.ref_frame[0] = LAST_FRAME; xd->mi[0].src_mi->mbmi.sb_type = BLOCK_64X64; - vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, - xd->mi[0].src_mi->mbmi.ref_mvs[LAST_FRAME], - &nearest_mv, &near_mv); - - xd->mi[0].src_mi->mbmi.mv[0] = nearest_mv; + xd->mi[0].src_mi->mbmi.mv[0].as_int = 0; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64); d = xd->plane[0].dst.buf; @@ -750,8 +745,8 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, x->e_mbd.plane[i].subsampling_y); } -static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate, - int64_t *dist, BLOCK_SIZE bsize) { +static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, + RD_COST *rd_cost, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi; INTERP_FILTER filter_ref; @@ -777,15 +772,16 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate, xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = 0; x->skip = 1; - *rate = 0; - *dist = 0; + vp9_rd_cost_init(rd_cost); } -static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, +static void rd_pick_sb_modes(VP9_COMP *cpi, + TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; @@ -801,7 +797,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile_info, mi_row, mi_col, bsize); mbmi = &xd->mi[0].src_mi->mbmi; mbmi->sb_type = bsize; @@ -813,6 +809,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, } ctx->is_coded = 0; ctx->skippable = 0; + ctx->pred_pixel_ready = 0; x->skip_recode = 0; // Set to zero to make sure we do not use the previous encoded frame stats @@ -872,14 +869,14 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, } else { if (bsize >= BLOCK_8X8) { if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) - vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, rd_cost, bsize, + vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else - vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, + vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); } else { - vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, rd_cost, - bsize, ctx, best_rd); + vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd); } } @@ -1161,79 +1158,6 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } -static void copy_partitioning(VP9_COMMON *cm, MODE_INFO *mi_8x8, - MODE_INFO *prev_mi_8x8) { - const int mis = cm->mi_stride; - int block_row, block_col; - - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - MODE_INFO *const prev_mi = - prev_mi_8x8[block_row * mis + block_col].src_mi; - const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; - - if (prev_mi) { - const ptrdiff_t offset = prev_mi - cm->prev_mi; - mi_8x8[block_row * mis + block_col].src_mi = cm->mi + offset; - mi_8x8[block_row * mis + block_col].src_mi->mbmi.sb_type = sb_type; - } - } - } -} - -static void constrain_copy_partitioning(VP9_COMP *const cpi, - const TileInfo *const tile, - MODE_INFO *mi_8x8, - MODE_INFO *prev_mi_8x8, - int mi_row, int mi_col, - BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mi_stride; - const int row8x8_remaining = tile->mi_row_end - mi_row; - const int col8x8_remaining = tile->mi_col_end - mi_col; - MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col; - const int bh = num_8x8_blocks_high_lookup[bsize]; - const int bw = num_8x8_blocks_wide_lookup[bsize]; - int block_row, block_col; - - assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); - - // If the SB64 if it is all "in image". - if ((col8x8_remaining >= MI_BLOCK_SIZE) && - (row8x8_remaining >= MI_BLOCK_SIZE)) { - for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { - for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { - const int index = block_row * mis + block_col; - MODE_INFO *prev_mi = prev_mi_8x8[index].src_mi; - const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; - // Use previous partition if block size is not larger than bsize. - if (prev_mi && sb_type <= bsize) { - int block_row2, block_col2; - for (block_row2 = 0; block_row2 < bh; ++block_row2) { - for (block_col2 = 0; block_col2 < bw; ++block_col2) { - const int index2 = (block_row + block_row2) * mis + - block_col + block_col2; - prev_mi = prev_mi_8x8[index2].src_mi; - if (prev_mi) { - const ptrdiff_t offset = prev_mi - cm->prev_mi; - mi_8x8[index2].src_mi = cm->mi + offset; - mi_8x8[index2].src_mi->mbmi.sb_type = prev_mi->mbmi.sb_type; - } - } - } - } else { - // Otherwise, use fixed partition of size bsize. - mi_8x8[index].src_mi = mi_upper_left + index; - mi_8x8[index].src_mi->mbmi.sb_type = bsize; - } - } - } - } else { - // Else this is a partial SB64, copy previous partition. - copy_partitioning(cm, mi_8x8, prev_mi_8x8); - } -} - const struct { int row; int col; @@ -1364,27 +1288,6 @@ static int is_background(const VP9_COMP *cpi, const TileInfo *const tile, return this_sad < 2 * threshold; } -static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO *prev_mi_8x8, - const int motion_thresh) { - const int mis = cm->mi_stride; - int block_row, block_col; - - if (cm->prev_mi) { - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - const MODE_INFO *prev_mi = - prev_mi_8x8[block_row * mis + block_col].src_mi; - if (prev_mi) { - if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh || - abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh) - return 1; - } - } - } - } - return 0; -} - static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, int bsize) { VP9_COMMON *const cm = &cpi->common; @@ -1516,12 +1419,15 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, +static void rd_use_partition(VP9_COMP *cpi, + TileDataEnc *tile_data, MODE_INFO *mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, - BLOCK_SIZE bsize, int *rate, int64_t *dist, + BLOCK_SIZE bsize, + int *rate, int64_t *dist, int do_recon, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; const int mis = cm->mi_stride; @@ -1557,7 +1463,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile_info, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1583,7 +1489,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, mi_row + (mi_step >> 1) < cm->mi_rows && mi_col + (mi_step >> 1) < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rdc, bsize, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &none_rdc, bsize, ctx, INT64_MAX); pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -1602,11 +1508,11 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, switch (partition) { case PARTITION_NONE: - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc, bsize, ctx, INT64_MAX); break; case PARTITION_HORZ: - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc, subsize, &pc_tree->horizontal[0], INT64_MAX); if (last_part_rdc.rate != INT_MAX && @@ -1616,7 +1522,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); - rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, + rd_pick_sb_modes(cpi, tile_data, + mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); @@ -1628,7 +1535,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, } break; case PARTITION_VERT: - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc, subsize, &pc_tree->vertical[0], INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { @@ -1637,7 +1544,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); - rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, + rd_pick_sb_modes(cpi, tile_data, + mi_row, mi_col + (mi_step >> 1), &tmp_rdc, subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { @@ -1651,7 +1559,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, break; case PARTITION_SPLIT: if (bsize == BLOCK_8X8) { - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &last_part_rdc, subsize, pc_tree->leaf_split[0], INT64_MAX); break; } @@ -1667,7 +1575,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, continue; vp9_rd_cost_init(&tmp_rdc); - rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, + rd_use_partition(cpi, tile_data, + mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, &tmp_rdc.dist, i != 3, pc_tree->split[i]); @@ -1718,7 +1627,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); pc_tree->split[i]->partitioning = PARTITION_NONE; - rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, + rd_pick_sb_modes(cpi, tile_data, + mi_row + y_idx, mi_col + x_idx, &tmp_rdc, split_subsize, &pc_tree->split[i]->none, INT64_MAX); restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1732,7 +1642,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, chosen_rdc.dist += tmp_rdc.dist; if (i != 3) - encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, + encode_sb(cpi, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0, split_subsize, pc_tree->split[i]); pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, @@ -1782,7 +1692,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, chosen_rdc.rate, chosen_rdc.dist); - encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, + encode_sb(cpi, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); } @@ -2115,11 +2025,13 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. -static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, +static void rd_pick_partition(VP9_COMP *cpi, + TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost, int64_t best_rd, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2; @@ -2162,7 +2074,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, vp9_rd_cost_reset(&best_rdc); best_rdc.rdcost = best_rd; - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile_info, mi_row, mi_col, bsize); if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) x->mb_energy = vp9_block_energy(cpi, x, bsize); @@ -2194,7 +2106,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile_info, mi_row, mi_col, bsize); src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize); } @@ -2253,8 +2165,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_NONE if (partition_none_allowed) { - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rdc, bsize, ctx, - best_rdc.rdcost); + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, + &this_rdc, bsize, ctx, best_rdc.rdcost); if (this_rdc.rate != INT_MAX) { if (bsize >= BLOCK_8X8) { pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2323,7 +2235,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } if (skip) { if (src_diff_var == UINT_MAX) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile_info, mi_row, mi_col, bsize); src_diff_var = get_sby_perpixel_diff_variance( cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize); } @@ -2353,7 +2265,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) pc_tree->leaf_split[0]->pred_interp_filter = ctx->mic.mbmi.interp_filter; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize, pc_tree->leaf_split[0], best_rdc.rdcost); if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX; @@ -2369,7 +2281,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, load_pred_mv(x, ctx); pc_tree->split[i]->index = i; - rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, + rd_pick_partition(cpi, tile_data, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); @@ -2412,7 +2325,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = ctx->mic.mbmi.interp_filter; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->horizontal[0], best_rdc.rdcost); if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && @@ -2427,8 +2340,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = ctx->mic.mbmi.interp_filter; - rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rdc, - subsize, &pc_tree->horizontal[1], + rd_pick_sb_modes(cpi, tile_data, mi_row + mi_step, mi_col, + &this_rdc, subsize, &pc_tree->horizontal[1], best_rdc.rdcost - sum_rdc.rdcost); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -2460,7 +2373,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = ctx->mic.mbmi.interp_filter; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->vertical[0], best_rdc.rdcost); if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) { @@ -2474,7 +2387,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = ctx->mic.mbmi.interp_filter; - rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rdc, subsize, + rd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + mi_step, + &this_rdc, subsize, &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -2520,7 +2434,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, best_rdc.rate, best_rdc.dist); - encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); + encode_sb(cpi, tile_info, tp, mi_row, mi_col, output_enabled, + bsize, pc_tree); } if (bsize == BLOCK_64X64) { @@ -2532,9 +2447,12 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } -static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { +static void encode_rd_sb_row(VP9_COMP *cpi, + TileDataEnc *tile_data, + int mi_row, + TOKENEXTRA **tp) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &cpi->mb.e_mbd; SPEED_FEATURES *const sf = &cpi->sf; int mi_col; @@ -2544,7 +2462,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += MI_BLOCK_SIZE) { int dummy_rate; int64_t dummy_dist; @@ -2553,10 +2471,6 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO *mi = cm->mi + idx_str; - MODE_INFO *prev_mi = NULL; - - if (cm->frame_type != KEY_FRAME) - prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi; if (sf->adaptive_pred_interp_filter) { for (i = 0; i < 64; ++i) @@ -2573,56 +2487,34 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, vp9_zero(cpi->mb.pred_mv); cpi->pc_root->index = 0; - // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good- - // quality encoding. Need to evaluate it in real-time encoding later to - // decide if it can be removed too. And then, do the code cleanup. cpi->mb.source_variance = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION) { - set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, + set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, sf->always_this_block_size); - rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, cpi->pc_root); + rd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rate, &dummy_dist, 1, cpi->pc_root); } else if (cpi->partition_search_skippable_frame) { BLOCK_SIZE bsize; - set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64); bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col); - set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, cpi->pc_root); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rate, &dummy_dist, 1, cpi->pc_root); } else if (sf->partition_search_type == VAR_BASED_PARTITION && cm->frame_type != KEY_FRAME ) { - choose_partitioning(cpi, tile, mi_row, mi_col); - rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, cpi->pc_root); - } else if (sf->partition_search_type == SEARCH_PARTITION && - sf->use_lastframe_partitioning && - (cpi->rc.frames_since_key % - sf->last_partitioning_redo_frequency) && - cm->prev_mi && - cm->show_frame && - cm->frame_type != KEY_FRAME && - !cpi->rc.is_src_frame_alt_ref && - ((sf->use_lastframe_partitioning != - LAST_FRAME_PARTITION_LOW_MOTION) || - !sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) { - if (sf->constrain_copy_partition && - sb_has_motion(cm, prev_mi, sf->lf_motion_threshold)) - constrain_copy_partitioning(cpi, tile, mi, prev_mi, - mi_row, mi_col, BLOCK_16X16); - else - copy_partitioning(cm, mi, prev_mi); - rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, cpi->pc_root); + choose_partitioning(cpi, tile_info, mi_row, mi_col); + rd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rate, &dummy_dist, 1, cpi->pc_root); } else { // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { - set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - rd_auto_partition_range(cpi, tile, mi_row, mi_col, + set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, tile_info, mi_row, mi_col, &sf->min_partition_size, &sf->max_partition_size); } - rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, + rd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, INT64_MAX, cpi->pc_root); } } @@ -2695,15 +2587,16 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) { return cpi->common.tx_mode; } -static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, - int *rate, int64_t *dist, +static void nonrd_pick_sb_modes(VP9_COMP *cpi, + TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile_info, mi_row, mi_col, bsize); mbmi = &xd->mi[0].src_mi->mbmi; mbmi->sb_type = bsize; @@ -2712,11 +2605,15 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) - set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize); + set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); else - vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx); + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, + rd_cost, bsize, ctx); duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + + if (rd_cost->rate == INT_MAX) + vp9_rd_cost_reset(rd_cost); } static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, @@ -2776,14 +2673,16 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, } } -static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, +static void nonrd_pick_partition(VP9_COMP *cpi, + TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, - int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd, + int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost, + int do_recon, int64_t best_rd, PC_TREE *pc_tree) { const SPEED_FEATURES *const sf = &cpi->sf; const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; @@ -2791,9 +2690,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, PICK_MODE_CONTEXT *ctx = &pc_tree->none; int i; BLOCK_SIZE subsize = bsize; - int this_rate, sum_rate = 0, best_rate = INT_MAX; - int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX; - int64_t sum_rd = 0; + RD_COST this_rdc, sum_rdc, best_rdc; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; // Override skipping rectangular partition operations for edge blocks @@ -2812,6 +2709,10 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); + vp9_rd_cost_init(&sum_rdc); + vp9_rd_cost_reset(&best_rdc); + best_rdc.rdcost = best_rd; + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. if (sf->auto_min_max_partition_size) { @@ -2832,17 +2733,19 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_NONE if (partition_none_allowed) { - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, - &this_rate, &this_dist, bsize, ctx); + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, + &this_rdc, bsize, ctx); ctx->mic.mbmi = xd->mi[0].src_mi->mbmi; ctx->skip_txfm[0] = x->skip_txfm[0]; ctx->skip = x->skip; + ctx->pred_pixel_ready = 0; - if (this_rate != INT_MAX) { + if (this_rdc.rate != INT_MAX) { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += cpi->partition_cost[pl][PARTITION_NONE]; - sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); - if (sum_rd < best_rd) { + this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + this_rdc.rate, this_rdc.dist); + if (this_rdc.rdcost < best_rdc.rdcost) { int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr; int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr; @@ -2851,15 +2754,13 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, rate_breakout_thr *= num_pels_log2_lookup[bsize]; - best_rate = this_rate; - best_dist = this_dist; - best_rd = sum_rd; + best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; if (!x->e_mbd.lossless && - this_rate < rate_breakout_thr && - this_dist < dist_breakout_thr) { + this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { do_split = 0; do_rect = 0; } @@ -2871,35 +2772,34 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, store_pred_mv(x, ctx); // PARTITION_SPLIT - sum_rd = 0; if (do_split) { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); subsize = get_subsize(bsize, PARTITION_SPLIT); - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) { const int x_idx = (i & 1) * ms; const int y_idx = (i >> 1) * ms; if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; load_pred_mv(x, ctx); - nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rate, &this_dist, 0, - best_rd - sum_rd, pc_tree->split[i]); + nonrd_pick_partition(cpi, tile_data, tp, + mi_row + y_idx, mi_col + x_idx, + subsize, &this_rdc, 0, + best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); - if (this_rate == INT_MAX) { - sum_rd = INT64_MAX; + if (this_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(&sum_rdc); } else { - sum_rate += this_rate; - sum_dist += this_dist; - sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost += this_rdc.rdcost; } } - if (sum_rd < best_rd) { - best_rate = sum_rate; - best_dist = sum_dist; - best_rd = sum_rd; + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_SPLIT; } else { // skip rectangular partition test when larger block size @@ -2915,40 +2815,39 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (sf->adaptive_motion_search) load_pred_mv(x, ctx); - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, - &this_rate, &this_dist, subsize, + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[0].skip = x->skip; + pc_tree->horizontal[0].pred_pixel_ready = 0; - sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); - - if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { + if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) { load_pred_mv(x, ctx); - nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, - &this_rate, &this_dist, subsize, + nonrd_pick_sb_modes(cpi, tile_data, mi_row + ms, mi_col, + &this_rdc, subsize, &pc_tree->horizontal[1]); pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; + pc_tree->horizontal[1].pred_pixel_ready = 0; - if (this_rate == INT_MAX) { - sum_rd = INT64_MAX; + if (this_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(&sum_rdc); } else { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += cpi->partition_cost[pl][PARTITION_HORZ]; - sum_rate += this_rate; - sum_dist += this_dist; - sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + sum_rdc.rate, sum_rdc.dist); } } - if (sum_rd < best_rd) { - best_rd = sum_rd; - best_rate = sum_rate; - best_dist = sum_dist; + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_HORZ; } } @@ -2960,55 +2859,54 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (sf->adaptive_motion_search) load_pred_mv(x, ctx); - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, - &this_rate, &this_dist, subsize, + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[0].skip = x->skip; - sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); - if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { + pc_tree->vertical[0].pred_pixel_ready = 0; + + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) { load_pred_mv(x, ctx); - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, - &this_rate, &this_dist, subsize, + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + ms, + &this_rdc, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; - if (this_rate == INT_MAX) { - sum_rd = INT64_MAX; + pc_tree->vertical[1].pred_pixel_ready = 0; + + if (this_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(&sum_rdc); } else { int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - this_rate += cpi->partition_cost[pl][PARTITION_VERT]; - sum_rate += this_rate; - sum_dist += this_dist; - sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + sum_rdc.rate, sum_rdc.dist); } } - if (sum_rd < best_rd) { - best_rate = sum_rate; - best_dist = sum_dist; - best_rd = sum_rd; + + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_VERT; } } - // TODO(JBB): The following line is here just to avoid a static warning - // that occurs because at this point we never again reuse best_rd - // despite setting it here. The code should be refactored to avoid this. - (void) best_rd; - *rate = best_rate; - *dist = best_dist; + *rd_cost = best_rdc; - if (best_rate == INT_MAX) + if (best_rdc.rate == INT_MAX) { + vp9_rd_cost_reset(rd_cost); return; + } // update mode info array subsize = get_subsize(bsize, pc_tree->partitioning); fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize, pc_tree); - if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) { int output_enabled = (bsize == BLOCK_64X64); // Check the projected output rate for this SB against it's target @@ -3016,33 +2914,165 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // closer to the target. if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, - best_rate); + best_rdc.rate); } if (oxcf->aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, - best_rate, best_dist); + best_rdc.rate, best_rdc.dist); - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); + encode_sb_rt(cpi, tile_info, tp, mi_row, mi_col, output_enabled, + bsize, pc_tree); } if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); - assert(best_rate < INT_MAX); - assert(best_dist < INT64_MAX); + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); } else { assert(tp_orig == *tp); } } +static void nonrd_select_partition(VP9_COMP *cpi, + TileDataEnc *tile_data, + MODE_INFO *mi, + TOKENEXTRA **tp, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int output_enabled, + RD_COST *rd_cost, PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4; + const int mis = cm->mi_stride; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + RD_COST this_rdc; + + vp9_rd_cost_reset(&this_rdc); + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + return; + + subsize = (bsize >= BLOCK_8X8) ? mi[0].src_mi->mbmi.sb_type : BLOCK_4X4; + partition = partition_lookup[bsl][subsize]; + + if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && + subsize >= BLOCK_16X16) { + cpi->sf.max_partition_size = BLOCK_32X32; + cpi->sf.min_partition_size = BLOCK_8X8; + nonrd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, bsize, + rd_cost, 0, INT64_MAX, pc_tree); + } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) { + cpi->sf.max_partition_size = BLOCK_16X16; + cpi->sf.min_partition_size = BLOCK_8X8; + nonrd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, bsize, + rd_cost, 0, INT64_MAX, pc_tree); + } else { + switch (partition) { + case PARTITION_NONE: + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost, + subsize, &pc_tree->none); + pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; + pc_tree->none.skip = x->skip; + pc_tree->none.pred_pixel_ready = 1; + break; + case PARTITION_VERT: + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost, + subsize, &pc_tree->vertical[0]); + pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; + pc_tree->vertical[0].pred_pixel_ready = 1; + if (mi_col + hbs < cm->mi_cols) { + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + hbs, + &this_rdc, subsize, &pc_tree->vertical[1]); + pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; + pc_tree->vertical[1].pred_pixel_ready = 1; + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + } + break; + case PARTITION_HORZ: + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost, + subsize, &pc_tree->horizontal[0]); + pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; + pc_tree->horizontal[0].pred_pixel_ready = 1; + if (mi_row + hbs < cm->mi_rows) { + nonrd_pick_sb_modes(cpi, tile_data, mi_row + hbs, mi_col, + &this_rdc, subsize, &pc_tree->horizontal[0]); + pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; + pc_tree->horizontal[1].pred_pixel_ready = 1; + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + } + break; + case PARTITION_SPLIT: + subsize = get_subsize(bsize, PARTITION_SPLIT); + nonrd_select_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + subsize, output_enabled, rd_cost, + pc_tree->split[0]); + nonrd_select_partition(cpi, tile_data, mi + hbs, tp, + mi_row, mi_col + hbs, subsize, output_enabled, + &this_rdc, pc_tree->split[1]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + nonrd_select_partition(cpi, tile_data, mi + hbs * mis, tp, + mi_row + hbs, mi_col, subsize, output_enabled, + &this_rdc, pc_tree->split[2]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + nonrd_select_partition(cpi, tile_data, mi + hbs * mis + hbs, tp, + mi_row + hbs, mi_col + hbs, subsize, + output_enabled, &this_rdc, pc_tree->split[3]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; + } + break; + default: + assert("Invalid partition type."); + break; + } + } + + if (bsize == BLOCK_64X64 && output_enabled) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, + rd_cost->rate, rd_cost->dist); + encode_sb_rt(cpi, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree); + } +} + + static void nonrd_use_partition(VP9_COMP *cpi, - const TileInfo *const tile, + TileDataEnc *tile_data, MODE_INFO *mi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled, - int *totrate, int64_t *totdist, - PC_TREE *pc_tree) { + RD_COST *rd_cost, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -3050,9 +3080,9 @@ static void nonrd_use_partition(VP9_COMP *cpi, const int mis = cm->mi_stride; PARTITION_TYPE partition; BLOCK_SIZE subsize; - int rate = INT_MAX; - int64_t dist = INT64_MAX; + RD_COST this_rdc; + vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3061,78 +3091,78 @@ static void nonrd_use_partition(VP9_COMP *cpi, switch (partition) { case PARTITION_NONE: - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost, subsize, &pc_tree->none); pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; pc_tree->none.skip = x->skip; break; case PARTITION_VERT: - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[0].skip = x->skip; if (mi_col + hbs < cm->mi_cols) { - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs, - &rate, &dist, subsize, &pc_tree->vertical[1]); + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col + hbs, + &this_rdc, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; - if (rate != INT_MAX && dist != INT64_MAX && - *totrate != INT_MAX && *totdist != INT64_MAX) { - *totrate += rate; - *totdist += dist; + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; } } break; case PARTITION_HORZ: - nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + nonrd_pick_sb_modes(cpi, tile_data, mi_row, mi_col, rd_cost, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[0].skip = x->skip; if (mi_row + hbs < cm->mi_rows) { - nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, - &rate, &dist, subsize, &pc_tree->horizontal[0]); + nonrd_pick_sb_modes(cpi, tile_data, mi_row + hbs, mi_col, + &this_rdc, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi; pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; - if (rate != INT_MAX && dist != INT64_MAX && - *totrate != INT_MAX && *totdist != INT64_MAX) { - *totrate += rate; - *totdist += dist; + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; } } break; case PARTITION_SPLIT: subsize = get_subsize(bsize, PARTITION_SPLIT); - nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, - subsize, output_enabled, totrate, totdist, + nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + subsize, output_enabled, rd_cost, pc_tree->split[0]); - nonrd_use_partition(cpi, tile, mi + hbs, tp, + nonrd_use_partition(cpi, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, - &rate, &dist, pc_tree->split[1]); - if (rate != INT_MAX && dist != INT64_MAX && - *totrate != INT_MAX && *totdist != INT64_MAX) { - *totrate += rate; - *totdist += dist; + &this_rdc, pc_tree->split[1]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; } - nonrd_use_partition(cpi, tile, mi + hbs * mis, tp, + nonrd_use_partition(cpi, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - &rate, &dist, pc_tree->split[2]); - if (rate != INT_MAX && dist != INT64_MAX && - *totrate != INT_MAX && *totdist != INT64_MAX) { - *totrate += rate; - *totdist += dist; + &this_rdc, pc_tree->split[2]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; } - nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp, + nonrd_use_partition(cpi, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, output_enabled, - &rate, &dist, pc_tree->split[3]); - if (rate != INT_MAX && dist != INT64_MAX && - *totrate != INT_MAX && *totdist != INT64_MAX) { - *totrate += rate; - *totdist += dist; + &this_rdc, pc_tree->split[3]); + if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && + rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { + rd_cost->rate += this_rdc.rate; + rd_cost->dist += this_rdc.dist; } break; default: @@ -3143,15 +3173,19 @@ static void nonrd_use_partition(VP9_COMP *cpi, if (bsize == BLOCK_64X64 && output_enabled) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, - *totrate, *totdist); - encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree); + rd_cost->rate, rd_cost->dist); + encode_sb_rt(cpi, &tile_data->tile_info, tp, mi_row, mi_col, + 1, bsize, pc_tree); } } -static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { +static void encode_nonrd_sb_row(VP9_COMP *cpi, + TileDataEnc *tile_data, + int mi_row, + TOKENEXTRA **tp) { SPEED_FEATURES *const sf = &cpi->sf; VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; int mi_col; @@ -3161,53 +3195,55 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end; mi_col += MI_BLOCK_SIZE) { - int dummy_rate = 0; - int64_t dummy_dist = 0; + RD_COST dummy_rdc; const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO *mi = cm->mi + idx_str; BLOCK_SIZE bsize; x->in_static_area = 0; x->source_variance = UINT_MAX; vp9_zero(x->pred_mv); + vp9_rd_cost_init(&dummy_rdc); // Set the partition type of the 64X64 block switch (sf->partition_search_type) { case VAR_BASED_PARTITION: - choose_partitioning(cpi, tile, mi_row, mi_col); - nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - 1, &dummy_rate, &dummy_dist, cpi->pc_root); + choose_partitioning(cpi, tile_info, mi_row, mi_col); + nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root); break; case SOURCE_VAR_BASED_PARTITION: - set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col); - nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - 1, &dummy_rate, &dummy_dist, cpi->pc_root); + set_source_var_based_partition(cpi, tile_info, mi, mi_row, mi_col); + nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root); break; case FIXED_PARTITION: bsize = sf->partition_search_type == FIXED_PARTITION ? sf->always_this_block_size : get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col); - set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); - nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, - 1, &dummy_rate, &dummy_dist, cpi->pc_root); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + nonrd_use_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root); break; case REFERENCE_PARTITION: - if (sf->partition_check || - !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) { - set_modeinfo_offsets(cm, xd, mi_row, mi_col); - auto_partition_range(cpi, tile, mi_row, mi_col, + set_offsets(cpi, tile_info, mi_row, mi_col, BLOCK_64X64); + x->in_static_area = is_background(cpi, tile_info, mi_row, mi_col); + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + xd->mi[0].src_mi->mbmi.segment_id && x->in_static_area) { + auto_partition_range(cpi, tile_info, mi_row, mi_col, &sf->min_partition_size, &sf->max_partition_size); - nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, - cpi->pc_root); + nonrd_pick_partition(cpi, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, + INT64_MAX, cpi->pc_root); } else { - choose_partitioning(cpi, tile, mi_row, mi_col); - nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, - BLOCK_64X64, 1, &dummy_rate, &dummy_dist, - cpi->pc_root); + choose_partitioning(cpi, tile_info, mi_row, mi_col); + nonrd_select_partition(cpi, tile_data, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rdc, cpi->pc_root); } + break; default: assert(0); @@ -3343,43 +3379,64 @@ static int get_skip_encode_frame(const VP9_COMMON *cm) { cm->show_frame; } +static void tile_data_init(TileDataEnc *tile_data) { + int i, j; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = 32; + tile_data->mode_map[i][j] = j; + } + } +} + static void encode_tiles(VP9_COMP *cpi) { - const VP9_COMMON *const cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; int tile_col, tile_row; - TileInfo tile[4][1 << 6]; TOKENEXTRA *tok[4][1 << 6]; TOKENEXTRA *pre_tok = cpi->tok; int tile_tok = 0; + if (cpi->tile_data == NULL) { + CHECK_MEM_ERROR(cm, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) + tile_data_init(&cpi->tile_data[tile_row * tile_cols + tile_col]); + } + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col); + TileInfo *tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + vp9_tile_init(tile_info, cm, tile_row, tile_col); tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = tok[tile_row][tile_col]; - tile_tok = allocated_tokens(tile[tile_row][tile_col]); + tile_tok = allocated_tokens(*tile_info); } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const TileInfo * const ptile = &tile[tile_row][tile_col]; + const TileInfo * const tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; TOKENEXTRA * const old_tok = tok[tile_row][tile_col]; int mi_row; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (mi_row = ptile->mi_row_start; mi_row < ptile->mi_row_end; + for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; mi_row += MI_BLOCK_SIZE) { if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm)) - encode_nonrd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]); + encode_nonrd_sb_row(cpi, this_tile, mi_row, &tok[tile_row][tile_col]); else - encode_rd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]); + encode_rd_sb_row(cpi, this_tile, mi_row, &tok[tile_row][tile_col]); } cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok[tile_row][tile_col] - old_tok); - assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*ptile)); + assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*tile_info)); } } } @@ -3737,7 +3794,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } - if (!cpi->sf.reuse_inter_pred_sby || seg_skip) + if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip) vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index f5faa7c23..8ce30789f 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -29,12 +29,6 @@ struct optimize_ctx { ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; }; -struct encode_b_args { - MACROBLOCK *x; - struct optimize_ctx *ctx; - int8_t *skip; -}; - void vp9_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, @@ -802,7 +796,7 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { } } -static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; @@ -1040,18 +1034,10 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, *(args->skip) = 0; } -void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int8_t *skip) { - struct encode_b_args arg = {x, NULL, skip}; - encode_block_intra(plane, block, plane_bsize, tx_size, &arg); -} - - void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { const MACROBLOCKD *const xd = &x->e_mbd; struct encode_b_args arg = {x, NULL, &xd->mi[0].src_mi->mbmi.skip}; - vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra, - &arg); + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, + vp9_encode_block_intra, &arg); } diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 54d2b3751..97df8a66b 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -18,6 +18,11 @@ extern "C" { #endif +struct encode_b_args { + MACROBLOCK *x; + struct optimize_ctx *ctx; + int8_t *skip; +}; void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, @@ -29,9 +34,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int8_t *skip); +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index 089839567..f36d76e3d 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -163,7 +163,7 @@ static void write_mv_update(const vp9_tree_index *tree, void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) { int i, j; - nmv_context *const mvc = &cm->fc.nmvc; + nmv_context *const mvc = &cm->fc->nmvc; nmv_context_counts *const counts = &cm->counts.mv; write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index c5e872607..dfc636a41 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -134,7 +134,7 @@ static void setup_frame(VP9_COMP *cpi) { cpi->refresh_alt_ref_frame = 1; vp9_zero(cpi->interp_filter_selected); } else { - cm->fc = cm->frame_contexts[cm->frame_context_idx]; + *cm->fc = cm->frame_contexts[cm->frame_context_idx]; vp9_zero(cpi->interp_filter_selected[0]); } } @@ -160,6 +160,13 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i; + vpx_free(cm->fc); + cm->fc = NULL; + vpx_free(cm->frame_contexts); + cm->frame_contexts = NULL; + vpx_free(cpi->tile_data); + cpi->tile_data = NULL; + // Delete sementation map vpx_free(cpi->segmentation_map); cpi->segmentation_map = NULL; @@ -257,7 +264,7 @@ static void save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); - cc->fc = cm->fc; + cc->fc = *cm->fc; } static void restore_coding_context(VP9_COMP *cpi) { @@ -286,7 +293,7 @@ static void restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); - cm->fc = cc->fc; + *cm->fc = cc->fc; } static void configure_static_seg_features(VP9_COMP *cpi) { @@ -1374,6 +1381,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cm->error.setjmp = 1; + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(cm, cm->frame_contexts, + (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, + sizeof(*cm->frame_contexts))); + cpi->use_svc = 0; init_config(cpi, oxcf); @@ -1381,6 +1394,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cm->current_video_frame = 0; cpi->partition_search_skippable_frame = 0; + cpi->tile_data = NULL; // Create the encoder segmentation map and set all entries to 0 CHECK_MEM_ERROR(cm, cpi->segmentation_map, @@ -1435,6 +1449,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { #endif cpi->refresh_alt_ref_frame = 0; + cpi->multi_arf_last_grp_enabled = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -2397,30 +2412,37 @@ void vp9_scale_references(VP9_COMP *cpi) { const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG}; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; - const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf; - // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1). - if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) && - (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) { - const int new_fb = get_free_fb(cm); - vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif // CONFIG_VP9_HIGHBITDEPTH - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) { + const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; + const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf; + #if CONFIG_VP9_HIGHBITDEPTH - scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf, - (int)cm->bit_depth); + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + const int new_fb = get_free_fb(cm); + vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + cm->use_highbitdepth, + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf, + (int)cm->bit_depth); #else - scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + const int new_fb = get_free_fb(cm); + vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); #endif // CONFIG_VP9_HIGHBITDEPTH - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + } else { + cpi->scaled_ref_idx[ref_frame - 1] = idx; + ++cm->frame_bufs[idx].ref_count; + } } else { - cpi->scaled_ref_idx[ref_frame - 1] = idx; - cm->frame_bufs[idx].ref_count++; + cpi->scaled_ref_idx[ref_frame - 1] = INVALID_REF_BUFFER_IDX; } } } @@ -2428,9 +2450,13 @@ void vp9_scale_references(VP9_COMP *cpi) { static void release_scaled_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int i; - - for (i = 0; i < 3; i++) - cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--; + for (i = 0; i < MAX_REF_FRAMES; ++i) { + const int idx = cpi->scaled_ref_idx[i]; + RefCntBuffer *const buf = + idx != INVALID_REF_BUFFER_IDX ? &cm->frame_bufs[idx] : NULL; + if (buf != NULL) + --buf->ref_count; + } } static void full_to_model_count(unsigned int *model_count, @@ -2515,10 +2541,181 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { } #endif -static void encode_without_recode_loop(VP9_COMP *cpi, - int q) { +static void set_mv_search_params(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const unsigned int max_mv_def = MIN(cm->width, cm->height); + + // Default based on max resolution. + cpi->mv_step_param = vp9_init_search_range(max_mv_def); + + if (cpi->sf.mv.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + cpi->mv_step_param = + vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + } + cpi->max_mv_magnitude = 0; + } + } +} + +static void set_size_dependent_vars(VP9_COMP *cpi, int *q, + int *bottom_index, int *top_index) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + + // Setup variables that depend on the dimensions of the frame. + set_mv_search_params(cpi); + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (oxcf->pass == 2 && cpi->sf.static_segmentation) + configure_static_seg_features(cpi); + +#if CONFIG_VP9_POSTPROC + if (oxcf->noise_sensitivity > 0) { + int l = 0; + switch (oxcf->noise_sensitivity) { + case 1: + l = 20; + break; + case 2: + l = 40; + break; + case 3: + l = 60; + break; + case 4: + case 5: + l = 100; + break; + case 6: + l = 150; + break; + } + vp9_denoise(cpi->Source, cpi->Source, l); + } +#endif // CONFIG_VP9_POSTPROC + + vp9_set_speed_features(cpi); + + vp9_set_rd_speed_thresholds(cpi); + vp9_set_rd_speed_thresholds_sub8x8(cpi); + + // Decide q and q bounds. + *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index); + + if (!frame_is_intra_only(cm)) { + cm->interp_filter = cpi->sf.default_interp_filter; + vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); + } +} + +static void init_motion_estimation(VP9_COMP *cpi) { + int y_stride = cpi->scaled_source.y_stride; + + if (cpi->sf.mv.search_method == NSTEP) { + vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride); + } else if (cpi->sf.mv.search_method == DIAMOND) { + vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); + } +} + +extern void vbr_rate_correction(VP9_COMP *cpi, + int * this_frame_target, + const int64_t vbr_bits_off_target); + +void set_frame_size(VP9_COMP *cpi) { + int ref_frame; + VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + + if ((oxcf->pass == 2) && + (!cpi->use_svc || + (is_two_pass_svc(cpi) && + cpi->svc.encode_empty_frame_state != ENCODING))) { + int target_rate = rc->base_frame_target; + if (oxcf->rc_mode == VPX_VBR) + vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target); + vp9_rc_set_frame_target(cpi, target_rate); + } + + if (oxcf->pass == 2 && + cm->current_video_frame == 0 && + oxcf->allow_spatial_resampling && + oxcf->rc_mode == VPX_VBR) { + // Internal scaling is triggered on the first frame. + vp9_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } + + // Reset the frame pointers to the current frame size. + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + + alloc_util_frame_buffers(cpi); + init_motion_estimation(cpi); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; + YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf; + RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; + ref_buf->buf = buf; + ref_buf->idx = idx; +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + buf->y_crop_width, buf->y_crop_height, + cm->width, cm->height, + (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? + 1 : 0); +#else + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + buf->y_crop_width, buf->y_crop_height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + if (vp9_is_scaled(&ref_buf->sf)) + vp9_extend_frame_borders(buf); + } + + set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); +} + +static void encode_without_recode_loop(VP9_COMP *cpi) { + int q; + int bottom_index, top_index; // Dummy. VP9_COMMON *const cm = &cpi->common; + vp9_clear_system_state(); + + set_frame_size(cpi); + + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source); + + if (cpi->unscaled_last_source != NULL) + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + + vp9_scale_references(cpi); + + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + vp9_set_quantizer(cm, q); setup_frame(cpi); // Variance adaptive and in frame q adjustment experiments are mutually @@ -2541,28 +2738,45 @@ static void encode_without_recode_loop(VP9_COMP *cpi, static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest, - int q, - int bottom_index, - int top_index) { + uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + int q; + int q_low, q_high; + int bottom_index, top_index; int loop_count = 0; int loop = 0; int overshoot_seen = 0; int undershoot_seen = 0; - int q_low = bottom_index, q_high = top_index; int frame_over_shoot_limit; int frame_under_shoot_limit; - // Decide frame size bounds - vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, - &frame_under_shoot_limit, - &frame_over_shoot_limit); - do { vp9_clear_system_state(); + if (loop_count == 0) { + set_frame_size(cpi); + + // Decide frame size bounds + vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source); + + if (cpi->unscaled_last_source != NULL) + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + + vp9_scale_references(cpi); + + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + + q_low = bottom_index; + q_high = top_index; + } + vp9_set_quantizer(cm, q); if (loop_count == 0) @@ -2804,25 +3018,6 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, } } -static int is_skippable_frame(const VP9_COMP *cpi) { - // If the current frame does not have non-zero motion vector detected in the - // first pass, and so do its previous and forward frames, then this frame - // can be skipped for partition check, and the partition size is assigned - // according to the variance - const SVC *const svc = &cpi->svc; - const TWO_PASS *const twopass = is_two_pass_svc(cpi) ? - &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; - - return (!frame_is_intra_only(&cpi->common) && - twopass->stats_in - 2 > twopass->stats_in_start && - twopass->stats_in < twopass->stats_in_end && - (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion - == 1 && - (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion - == 1 && - twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); -} - static void set_arf_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int arf_sign_bias; @@ -2839,31 +3034,6 @@ static void set_arf_sign_bias(VP9_COMP *cpi) { cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; } -static void set_mv_search_params(VP9_COMP *cpi) { - const VP9_COMMON *const cm = &cpi->common; - const unsigned int max_mv_def = MIN(cm->width, cm->height); - - // Default based on max resolution. - cpi->mv_step_param = vp9_init_search_range(max_mv_def); - - if (cpi->sf.mv.auto_mv_step_size) { - if (frame_is_intra_only(cm)) { - // Initialize max_mv_magnitude for use in the first INTER frame - // after a key/intra-only frame. - cpi->max_mv_magnitude = max_mv_def; - } else { - if (cm->show_frame) - // Allow mv_steps to correspond to twice the max mv magnitude found - // in the previous frame, capped by the default max_mv_magnitude based - // on resolution. - cpi->mv_step_param = - vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); - cpi->max_mv_magnitude = 0; - } - } -} - - int setup_interp_filter_search_mask(VP9_COMP *cpi) { INTERP_FILTER ifilter; int ref_total[MAX_REF_FRAMES] = {0}; @@ -2898,21 +3068,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, const VP9EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; TX_SIZE t; - int q; - int top_index; - int bottom_index; set_ext_overrides(cpi); - cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source); - - if (cpi->unscaled_last_source != NULL) - cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source); - - vp9_scale_references(cpi); - vp9_clear_system_state(); // Enable or disable mode based tweaking of the zbin. @@ -2927,14 +3085,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; - set_mv_search_params(cpi); - if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); - // Set various flags etc to special state if it is a key frame. if (frame_is_intra_only(cm)) { // Reset the loop filter deltas and segmentation map. @@ -2994,20 +3149,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } } - // Configure experimental use of segmentation for enhanced coding of - // static regions if indicated. - // Only allowed in second pass of two pass (as requires lagged coding) - // and if the relevant speed feature flag is set. - if (oxcf->pass == 2 && cpi->sf.static_segmentation) - configure_static_seg_features(cpi); - - // Check if the current frame is skippable for the partition search in the - // second pass according to the first pass stats - if (cpi->sf.allow_partition_search_skip && oxcf->pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. if (oxcf->pass == 0 && @@ -3022,31 +3163,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_clear_system_state(); -#if CONFIG_VP9_POSTPROC - if (oxcf->noise_sensitivity > 0) { - int l = 0; - switch (oxcf->noise_sensitivity) { - case 1: - l = 20; - break; - case 2: - l = 40; - break; - case 3: - l = 60; - break; - case 4: - case 5: - l = 100; - break; - case 6: - l = 150; - break; - } - vp9_denoise(cpi->Source, cpi->Source, l); - } -#endif - #if CONFIG_INTERNAL_STATS { int i; @@ -3055,24 +3171,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif - vp9_set_speed_features(cpi); - - vp9_set_rd_speed_thresholds(cpi); - vp9_set_rd_speed_thresholds_sub8x8(cpi); - - // Decide q and q bounds. - q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); - - if (!frame_is_intra_only(cm)) { - cm->interp_filter = cpi->sf.default_interp_filter; - /* TODO: Decide this more intelligently */ - vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH); - } - if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, q); + encode_without_recode_loop(cpi); } else { - encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); + encode_with_recode_loop(cpi, size, dest); } #if CONFIG_VP9_TEMPORAL_DENOISING @@ -3215,16 +3317,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, vp9_twopass_postencode_update(cpi); } -static void init_motion_estimation(VP9_COMP *cpi) { - int y_stride = cpi->scaled_source.y_stride; - - if (cpi->sf.mv.search_method == NSTEP) { - vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride); - } else if (cpi->sf.mv.search_method == DIAMOND) { - vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); - } -} - static void check_initial_width(VP9_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, @@ -3243,10 +3335,11 @@ static void check_initial_width(VP9_COMP *cpi, alloc_ref_frame_buffers(cpi); alloc_util_frame_buffers(cpi); - init_motion_estimation(cpi); + init_motion_estimation(cpi); // TODO(agrange) This can be removed. cpi->initial_width = cm->width; cpi->initial_height = cm->height; + cpi->initial_mbs = cm->MBs; } } @@ -3388,14 +3481,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int64_t *time_stamp, int64_t *time_end, int flush) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; RATE_CONTROL *const rc = &cpi->rc; struct vpx_usec_timer cmptimer; YV12_BUFFER_CONFIG *force_src_buffer = NULL; struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; - MV_REFERENCE_FRAME ref_frame; int arf_src_index; + int i; if (is_two_pass_svc(cpi)) { #if CONFIG_SPATIAL_SVC @@ -3416,11 +3508,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Note that at the moment multi_arf is only configured for 2 pass VBR and // will not work properly with svc. if ((oxcf->pass == 2) && !cpi->use_svc && - (cpi->oxcf.enable_auto_arf > 1) && (cpi->oxcf.rc_mode == VPX_VBR)) + (cpi->oxcf.enable_auto_arf > 1)) cpi->multi_arf_allowed = 1; else cpi->multi_arf_allowed = 0; - cpi->multi_arf_last_grp_enabled = 0; // Normal defaults cm->reset_frame_context = 0; @@ -3535,24 +3626,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_restore_layer_context(cpi); } - // start with a 0 size frame - *size = 0; - - /* find a free buffer for the new frame, releasing the reference previously - * held. - */ + // Find a free buffer for the new frame, releasing the reference previously + // held. cm->frame_bufs[cm->new_fb_idx].ref_count--; cm->new_fb_idx = get_free_fb(cm); - // For two pass encodes analyse the first pass stats and determine - // the bit allocation and other parameters for this frame / group of frames. - if ((oxcf->pass == 2) && - (!cpi->use_svc || - (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { - vp9_rc_get_second_pass_params(cpi); - } - if (!cpi->use_svc && cpi->multi_arf_allowed) { if (cm->frame_type == KEY_FRAME) { init_buffer_indices(cpi); @@ -3562,56 +3640,27 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } - cpi->frame_flags = *frame_flags; - - if (oxcf->pass == 2 && - cm->current_video_frame == 0 && - oxcf->allow_spatial_resampling && - oxcf->rc_mode == VPX_VBR) { - // Internal scaling is triggered on the first frame. - vp9_set_size_literal(cpi, oxcf->scaled_frame_width, - oxcf->scaled_frame_height); - } - - // Reset the frame pointers to the current frame size - vp9_realloc_frame_buffer(get_frame_new_buffer(cm), - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + // Start with a 0 size frame. + *size = 0; - alloc_util_frame_buffers(cpi); - init_motion_estimation(cpi); + cpi->frame_flags = *frame_flags; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; - YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf; - RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; - ref_buf->buf = buf; - ref_buf->idx = idx; -#if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame(&ref_buf->sf, - buf->y_crop_width, buf->y_crop_height, - cm->width, cm->height, - (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? - 1 : 0); -#else - vp9_setup_scale_factors_for_frame(&ref_buf->sf, - buf->y_crop_width, buf->y_crop_height, - cm->width, cm->height); -#endif // CONFIG_VP9_HIGHBITDEPTH - if (vp9_is_scaled(&ref_buf->sf)) - vp9_extend_frame_borders(buf); + if ((oxcf->pass == 2) && + (!cpi->use_svc || + (is_two_pass_svc(cpi) && + cpi->svc.encode_empty_frame_state != ENCODING))) { + vp9_rc_get_second_pass_params(cpi); + } else { + set_frame_size(cpi); } - set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); - if (oxcf->aq_mode == VARIANCE_AQ) { vp9_vaq_init(); } + for (i = 0; i < MAX_REF_FRAMES; ++i) + cpi->scaled_ref_idx[i] = INVALID_REF_BUFFER_IDX; + if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); @@ -3638,9 +3687,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } if (cm->refresh_frame_context) - cm->frame_contexts[cm->frame_context_idx] = cm->fc; + cm->frame_contexts[cm->frame_context_idx] = *cm->fc; - // Frame was dropped, release scaled references. + // No frame encoded, or frame was dropped, release scaled references. if (*size == 0) { release_scaled_references(cpi); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1e6047464..0e112f2ff 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -44,6 +44,7 @@ extern "C" { #endif #define DEFAULT_GF_INTERVAL 10 +#define INVALID_REF_BUFFER_IDX -1 // Marks an invalid reference buffer id. typedef struct { int nmvjointcost[MV_JOINTS]; @@ -122,7 +123,12 @@ typedef struct VP9EncoderConfig { int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: int speed; + // maximum allowed bitrate for any intra frame in % of bitrate target. unsigned int rc_max_intra_bitrate_pct; + // maximum allowed bitrate for any inter frame in % of bitrate target. + unsigned int rc_max_inter_bitrate_pct; + // percent of rate boost for golden frame in CBR mode. + unsigned int gf_cbr_boost_pct; MODE mode; int pass; @@ -223,6 +229,13 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. +typedef struct TileDataEnc { + TileInfo tile_info; + int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; + int mode_map[BLOCK_SIZES][MAX_MODES]; +} TileDataEnc; + typedef struct VP9_COMP { QUANTS quants; MACROBLOCK mb; @@ -238,10 +251,12 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG *unscaled_last_source; YV12_BUFFER_CONFIG scaled_last_source; + TileDataEnc *tile_data; + // For a still frame, this flag is set to 1 to skip partition search. int partition_search_skippable_frame; - int scaled_ref_idx[3]; + int scaled_ref_idx[MAX_REF_FRAMES]; int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; @@ -374,6 +389,10 @@ typedef struct VP9_COMP { int initial_width; int initial_height; + int initial_mbs; // Number of MBs in the full-size frame; to be used to + // normalize the firstpass stats. This will differ from the + // number of MBs in the current frame when the frame is + // scaled. int use_svc; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index f1baf8323..c8c784b73 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -138,7 +138,7 @@ static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm, struct vpx_codec_cx_pkt pkt; pkt.kind = VPX_CODEC_FPMB_STATS_PKT; pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; - pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t); + pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t); vpx_codec_pkt_list_add(pktlist, &pkt); } #endif @@ -483,7 +483,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs); + vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs); } #endif @@ -934,12 +934,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_clear_system_state(); { FIRSTPASS_STATS fps; - // The minimum error here insures some bit alocation to frames even + // The minimum error here insures some bit allocation to frames even // in static regions. The allocation per MB declines for larger formats // where the typical "real" energy per MB also falls. // Initial estimate here uses sqrt(mbs) to define the min_err, where the - // number of mbs is propotional to image area. - const double min_err = 200 * sqrt(cm->MBs); + // number of mbs is proportional to the image area. + const int num_mbs = + cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); fps.frame = cm->current_video_frame; fps.spatial_layer_id = cpi->svc.spatial_layer_id; @@ -947,9 +949,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; fps.intra_error = (double)(intra_error >> 8) + min_err; fps.count = 1.0; - fps.pcnt_inter = (double)intercount / cm->MBs; - fps.pcnt_second_ref = (double)second_ref_count / cm->MBs; - fps.pcnt_neutral = (double)neutral_count / cm->MBs; + fps.pcnt_inter = (double)intercount / num_mbs; + fps.pcnt_second_ref = (double)second_ref_count / num_mbs; + fps.pcnt_neutral = (double)neutral_count / num_mbs; if (mvcount > 0) { fps.MVr = (double)sum_mvr / mvcount; @@ -960,7 +962,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount; fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); fps.new_mv_count = new_mv_count; - fps.pcnt_motion = (double)mvcount / cm->MBs; + fps.pcnt_motion = (double)mvcount / num_mbs; } else { fps.MVr = 0.0; fps.mvr_abs = 0.0; @@ -1074,7 +1076,8 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, if (section_target_bandwidth <= 0) { return rc->worst_quality; // Highest value allowed } else { - const int num_mbs = cpi->common.MBs; + const int num_mbs = + cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs; const double section_err = stats->coded_error / stats->count; const double err_per_mb = section_err / num_mbs; const double speed_term = 1.0 + 0.04 * oxcf->speed; @@ -1188,9 +1191,12 @@ void vp9_init_second_pass(VP9_COMP *cpi) { #define LOW_SR_DIFF_TRHESH 0.1 #define SR_DIFF_MAX 128.0 -static double get_sr_decay_rate(const VP9_COMMON *cm, +static double get_sr_decay_rate(const VP9_COMP *cpi, const FIRSTPASS_STATS *frame) { - double sr_diff = (frame->sr_coded_error - frame->coded_error) / cm->MBs; + const int num_mbs = + cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs; + double sr_diff = + (frame->sr_coded_error - frame->coded_error) / num_mbs; double sr_decay = 1.0; const double motion_amplitude_factor = frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); @@ -1207,19 +1213,19 @@ static double get_sr_decay_rate(const VP9_COMMON *cm, // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. -static double get_zero_motion_factor(const VP9_COMMON *cm, +static double get_zero_motion_factor(const VP9_COMP *cpi, const FIRSTPASS_STATS *frame) { const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; - double sr_decay = get_sr_decay_rate(cm, frame); + double sr_decay = get_sr_decay_rate(cpi, frame); return MIN(sr_decay, zero_motion_pct); } #define ZM_POWER_FACTOR 0.75 -static double get_prediction_decay_rate(const VP9_COMMON *cm, +static double get_prediction_decay_rate(const VP9_COMP *cpi, const FIRSTPASS_STATS *next_frame) { - const double sr_decay_rate = get_sr_decay_rate(cm, next_frame); + const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame); const double zero_motion_factor = (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), ZM_POWER_FACTOR)); @@ -1314,9 +1320,11 @@ static double calc_frame_boost(VP9_COMP *cpi, vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_correction = MIN((0.5 + (lq * 0.015)), 1.5); + const int num_mbs = + cpi->oxcf.allow_spatial_resampling ? cpi->initial_mbs : cpi->common.MBs; // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * cpi->common.MBs) / + frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); frame_boost = frame_boost * BOOST_FACTOR * boost_correction; @@ -1365,7 +1373,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame); + decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } @@ -1404,7 +1412,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame); + decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } @@ -1723,7 +1731,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_err -= gf_first_frame_err; // Motion breakout threshold for loop below depends on image size. - mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 4.0; + mv_ratio_accumulator_thresh = + (cpi->common.height + cpi->common.width) / 4.0; // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. @@ -1775,14 +1784,14 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); decay_accumulator = decay_accumulator * loop_decay_rate; // Monitor for static sections. zero_motion_accumulator = MIN(zero_motion_accumulator, - get_zero_motion_factor(&cpi->common, &next_frame)); + get_zero_motion_factor(cpi, &next_frame)); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2048,8 +2057,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { break; // How fast is the prediction quality decaying? - loop_decay_rate = get_prediction_decay_rate(&cpi->common, - twopass->stats_in); + loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in); // We want to know something about the recent past... rather than // as used elsewhere where we are concerned with decay in prediction @@ -2160,7 +2168,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Monitor for static sections. zero_motion_accumulator = MIN(zero_motion_accumulator, - get_zero_motion_factor(&cpi->common, &next_frame)); + get_zero_motion_factor(cpi, &next_frame)); // Not all frames in the group are necessarily used in calculating boost. if ((i <= rc->max_gf_interval) || @@ -2171,7 +2179,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // How fast is prediction quality decaying. if (!detect_flash(twopass, 0)) { const double loop_decay_rate = - get_prediction_decay_rate(&cpi->common, &next_frame); + get_prediction_decay_rate(cpi, &next_frame); decay_accumulator *= loop_decay_rate; decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR); av_decay_accumulator += decay_accumulator; @@ -2295,6 +2303,24 @@ void configure_buffer_updates(VP9_COMP *cpi) { } } +int is_skippable_frame(const VP9_COMP *cpi) { + // If the current frame does not have non-zero motion vector detected in the + // first pass, and so do its previous and forward frames, then this frame + // can be skipped for partition check, and the partition size is assigned + // according to the variance + const SVC *const svc = &cpi->svc; + const TWO_PASS *const twopass = is_two_pass_svc(cpi) ? + &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; + + return (!frame_is_intra_only(&cpi->common) && + twopass->stats_in - 2 > twopass->stats_in_start && + twopass->stats_in < twopass->stats_in_end && + (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion + == 1 && + (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion + == 1 && + twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); +} void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -2329,11 +2355,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR) - vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target); - - vp9_rc_set_frame_target(cpi, target_rate); cm->frame_type = INTER_FRAME; if (lc != NULL) { @@ -2347,6 +2368,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { } } + // Do the firstpass stats indicate that this frame is skippable for the + // partition search? + if (cpi->sf.allow_partition_search_skip && + cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + } + return; } @@ -2377,8 +2405,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { this_frame_copy = this_frame; // Keyframe and section processing. - if (rc->frames_to_key == 0 || - (cpi->frame_flags & FRAMEFLAGS_KEY)) { + if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { // Define next KF group and assign bits to it. find_next_key_frame(cpi, &this_frame_copy); } else { @@ -2431,6 +2458,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { configure_buffer_updates(cpi); + // Do the firstpass stats indicate that this frame is skippable for the + // partition search? + if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_two_pass_svc(cpi))) { + cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + } + target_rate = gf_group->bit_allocation[gf_group->index]; if (cpi->common.frame_type == KEY_FRAME) target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate); @@ -2439,18 +2473,11 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { rc->base_frame_target = target_rate; - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR) - vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target); - - vp9_rc_set_frame_target(cpi, target_rate); - // Update the total stats remaining structure. subtract_stats(&twopass->total_left_stats, &this_frame); } #define MINQ_ADJ_LIMIT 32 -#define Q_LIMIT_STEP 1 void vp9_twopass_postencode_update(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; RATE_CONTROL *const rc = &cpi->rc; @@ -2495,16 +2522,22 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) { --twopass->extend_maxq; if (rc->rolling_target_bits >= rc->rolling_actual_bits) - twopass->extend_minq += Q_LIMIT_STEP; + ++twopass->extend_minq; // Overshoot. } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) { --twopass->extend_minq; if (rc->rolling_target_bits < rc->rolling_actual_bits) - twopass->extend_maxq += Q_LIMIT_STEP; + ++twopass->extend_maxq; } else { + // Adjustment for extreme local overshoot. + if (rc->projected_frame_size > (2 * rc->base_frame_target) && + rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) + ++twopass->extend_maxq; + + // Unwind undershoot or overshoot adjustment. if (rc->rolling_target_bits < rc->rolling_actual_bits) --twopass->extend_minq; - if (rc->rolling_target_bits > rc->rolling_actual_bits) + else if (rc->rolling_target_bits > rc->rolling_actual_bits) --twopass->extend_maxq; } twopass->extend_minq = clamp(twopass->extend_minq, 0, MINQ_ADJ_LIMIT); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index ae9ed66cd..28f12916e 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -452,7 +452,8 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, args->dist += dist; } -static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = { +static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = { + {THR_DC, THR_H_PRED, THR_V_PRED}, {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV}, {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG}, {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA}, @@ -461,13 +462,11 @@ static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = { // TODO(jingning) placeholder for inter-frame non-RD mode decision. // this needs various further optimizations. to be continued.. void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - int mi_row, int mi_col, - int *returnrate, - int64_t *returndistortion, - BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx) { + TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi; struct macroblockd_plane *const pd = &xd->plane[0]; @@ -480,11 +479,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; - int64_t best_rd = INT64_MAX; - int64_t this_rd = INT64_MAX; + RD_COST this_rdc, best_rdc; uint8_t skip_txfm = 0; - int rate = INT_MAX; - int64_t dist = INT64_MAX; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; @@ -500,8 +496,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, const int8_t segment_id = mbmi->segment_id; const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; - const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize]; - INTERP_FILTER filter_ref = cm->interp_filter; + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + INTERP_FILTER filter_ref; const int bsl = mi_width_log2_lookup[bsize]; const int pred_filter_search = cm->interp_filter == SWITCHABLE ? (((mi_row + mi_col) >> bsl) + @@ -544,9 +540,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; x->skip = 0; + if (xd->up_available) + filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter; + else if (xd->left_available) + filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter; + else + filter_ref = cm->interp_filter; + // initialize mode decisions - *returnrate = INT_MAX; - *returndistortion = INT64_MAX; + vp9_rd_cost_reset(&best_rdc); + vp9_rd_cost_reset(&this_rdc); + vp9_rd_cost_reset(rd_cost); vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO)); mbmi->sb_type = bsize; mbmi->ref_frame[0] = NONE; @@ -557,17 +561,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, EIGHTTAP : cm->interp_filter; mbmi->segment_id = segment_id; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; x->pred_mv_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; - if (xd->up_available) - filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter; - else if (xd->left_available) - filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); int_mv *const candidates = mbmi->ref_mvs[ref_frame]; @@ -576,10 +575,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, sf, sf); if (!cm->error_resilient_mode) - vp9_find_mv_refs(cm, xd, tile, xd->mi[0].src_mi, ref_frame, + vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame, candidates, mi_row, mi_col); else - const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0].src_mi, + const_motion[ref_frame] = mv_refs_rt(cm, xd, tile_info, + xd->mi[0].src_mi, ref_frame, candidates, mi_row, mi_col); @@ -606,27 +606,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int rate_mv = 0; int mode_rd_thresh; - if (const_motion[ref_frame] && - (this_mode == NEARMV || this_mode == ZEROMV)) + if (const_motion[ref_frame] && this_mode == NEARMV) continue; if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; mode_rd_thresh = - rd_threshes[mode_idx[ref_frame - - LAST_FRAME][INTER_OFFSET(this_mode)]]; - if (rd_less_than_thresh(best_rd, mode_rd_thresh, + rd_threshes[mode_idx[ref_frame][INTER_OFFSET(this_mode)]]; + if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, rd_thresh_freq_fact[this_mode])) continue; if (this_mode == NEWMV) { + if (ref_frame > LAST_FRAME) + continue; if (cpi->sf.partition_search_type != VAR_BASED_PARTITION && - this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize])) + this_rdc.rdcost < (int64_t)(1 << num_pels_log2_lookup[bsize])) continue; if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rd)) + &rate_mv, best_rdc.rdcost)) continue; } @@ -642,7 +642,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // motion vector is at sub-pixel accuracy level for luma component, i.e., // the last three bits are all zeros. if (cpi->sf.reuse_inter_pred_sby) { - if (this_mode == NEARESTMV) { + if (!this_mode_pred) { this_mode_pred = &tmp[3]; } else { this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; @@ -699,30 +699,34 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = best_filter; mbmi->tx_size = pf_tx_size[mbmi->interp_filter]; - rate = pf_rate[mbmi->interp_filter]; - dist = pf_dist[mbmi->interp_filter]; + this_rdc.rate = pf_rate[mbmi->interp_filter]; + this_rdc.dist = pf_dist[mbmi->interp_filter]; var_y = pf_var[mbmi->interp_filter]; sse_y = pf_sse[mbmi->interp_filter]; x->skip_txfm[0] = skip_txfm; } else { mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y); } - rate += rate_mv; - rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] + this_rdc.rate += rate_mv; + this_rdc.rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + this_rdc.rate, this_rdc.dist); // Skipping checking: test to see if this block can be reconstructed by // prediction only. if (cpi->allow_encode_breakout) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, - this_mode, var_y, sse_y, yv12_mb, &rate, &dist); + this_mode, var_y, sse_y, yv12_mb, + &this_rdc.rate, &this_rdc.dist); if (x->skip) { - rate += rate_mv; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + this_rdc.rate += rate_mv; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + this_rdc.rate, this_rdc.dist); } } @@ -734,10 +738,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (void)ctx; #endif - if (this_rd < best_rd || x->skip) { - best_rd = this_rd; - *returnrate = rate; - *returndistortion = dist; + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { + best_rdc = this_rdc; best_mode = this_mode; best_pred_filter = mbmi->interp_filter; best_tx_size = mbmi->tx_size; @@ -757,10 +759,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (x->skip) break; } - // If the current reference frame is valid and we found a usable mode, - // we are done. - if (best_rd < INT64_MAX) - break; + + // Check that a prediction mode has been selected. + assert(best_rdc.rdcost < INT64_MAX); } // If best prediction is not in dst buf, then copy the prediction block from @@ -792,7 +793,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // Perform intra prediction search, if the best SAD is above a certain // threshold. - if (!x->skip && best_rd > inter_mode_thresh && + if (!x->skip && best_rdc.rdcost > inter_mode_thresh && bsize <= cpi->sf.max_intra_bsize) { PREDICTION_MODE this_mode; struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 }; @@ -814,16 +815,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra, &args); mbmi->tx_size = saved_tx_size; - rate = args.rate; - dist = args.dist; - rate += cpi->mbmode_cost[this_mode]; - rate += intra_cost_penalty; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); - - if (this_rd + intra_mode_cost < best_rd) { - best_rd = this_rd; - *returnrate = rate; - *returndistortion = dist; + this_rdc.rate = args.rate; + this_rdc.dist = args.dist; + this_rdc.rate += cpi->mbmode_cost[this_mode]; + this_rdc.rate += intra_cost_penalty; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost + intra_mode_cost < best_rdc.rdcost) { + best_rdc = this_rdc; mbmi->mode = this_mode; mbmi->tx_size = intra_tx_size; mbmi->ref_frame[0] = INTRA_FRAME; @@ -836,4 +836,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->sf.reuse_inter_pred_sby) pd->dst = orig_dst; } + + if (is_inter_block(mbmi)) + vp9_update_rd_thresh_fact(cpi, tile_data, bsize, + mode_idx[ref_frame][INTER_OFFSET(mbmi->mode)]); + else + vp9_update_rd_thresh_fact(cpi, tile_data, bsize, + mode_idx[ref_frame][mbmi->mode]); + + *rd_cost = best_rdc; } diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h index 97aeca76a..23d347d94 100644 --- a/vp9/encoder/vp9_pickmode.h +++ b/vp9/encoder/vp9_pickmode.h @@ -18,10 +18,8 @@ extern "C" { #endif void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - const struct TileInfo *const tile, - int mi_row, int mi_col, - int *returnrate, - int64_t *returndistortion, + TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 65bca669a..8a5b6114c 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -196,6 +196,7 @@ static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; const int min_frame_target = MAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); if (target < min_frame_target) @@ -210,6 +211,11 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { // Clip the frame target to the maximum allowed value. if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = rc->avg_frame_bandwidth * + oxcf->rc_max_inter_bitrate_pct / 100; + target = MIN(target, max_rate); + } return target; } @@ -971,7 +977,13 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, if (!cpi->refresh_alt_ref_frame) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) + active_best_quality = (active_best_quality + cq_level + 1) / 2; } } else { active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); @@ -1327,7 +1339,18 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); - int target = rc->avg_frame_bandwidth; + int target; + + if (oxcf->gf_cbr_boost_pct) { + const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100; + target = cpi->refresh_golden_frame ? + (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) : + (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } else { + target = rc->avg_frame_bandwidth; + } if (svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR) { // Note that for layers, avg_frame_bandwidth is the cumulative @@ -1347,6 +1370,11 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); target += (target * pct_high) / 200; } + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = rc->avg_frame_bandwidth * + oxcf->rc_max_inter_bitrate_pct / 100; + target = MIN(target, max_rate); + } return MAX(min_frame_target, target); } @@ -1436,15 +1464,25 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; - target = calc_iframe_target_size_one_pass_cbr(cpi); } else { cm->frame_type = INTER_FRAME; - target = calc_pframe_target_size_one_pass_cbr(cpi); } + if (rc->frames_till_gf_update_due == 0) { + rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + + if (cm->frame_type == KEY_FRAME) + target = calc_iframe_target_size_one_pass_cbr(cpi); + else + target = calc_pframe_target_size_one_pass_cbr(cpi); + vp9_rc_set_frame_target(cpi, target); - // Don't use gf_update by default in CBR mode. - rc->frames_till_gf_update_due = INT_MAX; - rc->baseline_gf_interval = INT_MAX; } int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 7f526fc42..13e317d6d 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -65,7 +65,7 @@ static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { }; static void fill_mode_costs(VP9_COMP *cpi) { - const FRAME_CONTEXT *const fc = &cpi->common.fc; + const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; for (i = 0; i < INTRA_MODES; ++i) @@ -280,7 +280,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { set_block_thresholds(cm, rd); if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) { - fill_token_costs(x->token_costs, cm->fc.coef_probs); + fill_token_costs(x->token_costs, cm->fc->coef_probs); for (i = 0; i < PARTITION_CONTEXTS; ++i) vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i), @@ -295,11 +295,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { vp9_build_nmv_cost_table(x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, - &cm->fc.nmvc, cm->allow_high_precision_mv); + &cm->fc->nmvc, cm->allow_high_precision_mv); for (i = 0; i < INTER_MODE_CONTEXTS; ++i) vp9_cost_tokens((int *)cpi->inter_mode_cost[i], - cm->fc.inter_mode_probs[i], vp9_inter_mode_tree); + cm->fc->inter_mode_probs[i], vp9_inter_mode_tree); } } } @@ -594,21 +594,38 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { const SPEED_FEATURES *const sf = &cpi->sf; RD_OPT *const rd = &cpi->rd; int i; + static const int thresh_mult[2][MAX_REFS] = + {{2500, 2500, 2500, 4500, 4500, 2500}, + {2000, 2000, 2000, 4000, 4000, 2000}}; + + for (i = 0; i < MAX_REFS; ++i) { + rd->thresh_mult_sub8x8[i] = + (sf->disable_split_mask & (1 << i)) ? + INT_MAX : thresh_mult[cpi->oxcf.mode == BEST][i]; + } +} - for (i = 0; i < MAX_REFS; ++i) - rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0; - - rd->thresh_mult_sub8x8[THR_LAST] += 2500; - rd->thresh_mult_sub8x8[THR_GOLD] += 2500; - rd->thresh_mult_sub8x8[THR_ALTR] += 2500; - rd->thresh_mult_sub8x8[THR_INTRA] += 2500; - rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500; - rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500; - - // Check for masked out split cases. - for (i = 0; i < MAX_REFS; ++i) - if (sf->disable_split_mask & (1 << i)) - rd->thresh_mult_sub8x8[i] = INT_MAX; +// TODO(jingning) Refactor this function. Use targeted smaller struct as inputs. +void vp9_update_rd_thresh_fact(VP9_COMP *cpi, TileDataEnc *tile_data, + int bsize, int best_mode_index) { + if (cpi->sf.adaptive_rd_thresh > 0) { + const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; + int mode; + for (mode = 0; mode < top_mode; ++mode) { + const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4); + const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64); + BLOCK_SIZE bs; + for (bs = min_size; bs <= max_size; ++bs) { + int *const fact = &tile_data->thresh_freq_fact[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> 4); + } else { + *fact = MIN(*fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } + } + } + } } int vp9_get_intra_cost_penalty(int qindex, int qdelta, diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 1aa52663a..aecca0b43 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -36,6 +36,9 @@ extern "C" { #define MAX_MODES 30 #define MAX_REFS 6 +#define RD_THRESH_MAX_FACT 64 +#define RD_THRESH_INC 1 + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { @@ -129,6 +132,7 @@ void vp9_rd_cost_reset(RD_COST *rd_cost); void vp9_rd_cost_init(RD_COST *rd_cost); struct TileInfo; +struct TileDataEnc; struct VP9_COMP; struct macroblock; @@ -158,6 +162,10 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); +void vp9_update_rd_thresh_fact(struct VP9_COMP *cpi, + struct TileDataEnc *tile_data, + int bsize, int best_mode_index); + static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, int thresh_fact) { return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index eca8e5880..e80f345e8 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -38,9 +38,6 @@ #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_variance.h" -#define RD_THRESH_MAX_FACT 64 -#define RD_THRESH_INC 1 - #define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \ (1 << INTRA_FRAME)) #define GOLDEN_FRAME_MODE_MASK ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \ @@ -478,7 +475,8 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, return; if (!is_inter_block(mbmi)) { - vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); + struct encode_b_args arg = {x, NULL, &mbmi->skip}; + vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { dist_block(plane, block, tx_size, args, xd->bd); @@ -639,7 +637,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_rd = INT64_MAX; TX_SIZE best_tx = max_tx_size; - const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); @@ -2765,35 +2763,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); } -static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize, - int best_mode_index) { - if (cpi->sf.adaptive_rd_thresh > 0) { - const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; - int mode; - for (mode = 0; mode < top_mode; ++mode) { - const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4); - const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64); - BLOCK_SIZE bs; - for (bs = min_size; bs <= max_size; ++bs) { - int *const fact = &cpi->rd.thresh_freq_fact[bs][mode]; - if (mode == best_mode_index) { - *fact -= (*fact >> 4); - } else { - *fact = MIN(*fact + RD_THRESH_INC, - cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); - } - } - } - } -} - -void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, +void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, + TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; RD_OPT *const rd_opt = &cpi->rd; SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -2836,9 +2814,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; int mode_skip_start = sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; - const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize]; + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *mode_map = rd_opt->mode_map[bsize]; + int *mode_map = tile_data->mode_map[bsize]; const int mode_search_skip_flags = sf->mode_search_skip_flags; vp9_zero(best_mbmode); @@ -2869,7 +2847,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col, + setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3023,9 +3001,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (sf->motion_field_mode_search) { const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize], - tile->mi_col_end - mi_col); + tile_info->mi_col_end - mi_col); const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize], - tile->mi_row_end - mi_row); + tile_info->mi_row_end - mi_row); const int bsl = mi_width_log2_lookup[bsize]; int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm->current_video_frame)) & 0x1; @@ -3036,7 +3014,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int_mv ref_mv; ref_mv.as_int = INVALID_MV; - if ((mi_row - 1) >= tile->mi_row_start) { + if ((mi_row - 1) >= tile_info->mi_row_start) { ref_mv = xd->mi[-xd->mi_stride].src_mi->mbmi.mv[0]; rf = xd->mi[-xd->mi_stride].src_mi->mbmi.ref_frame[0]; for (i = 0; i < mi_width; ++i) { @@ -3047,7 +3025,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - if ((mi_col - 1) >= tile->mi_col_start) { + if ((mi_col - 1) >= tile_info->mi_col_start) { if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1].src_mi->mbmi.mv[0]; if (rf == NONE) @@ -3420,7 +3398,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, !is_inter_block(&best_mbmode)); if (!cpi->rc.is_src_frame_alt_ref) - update_rd_thresh_fact(cpi, bsize, best_mode_index); + vp9_update_rd_thresh_fact(cpi, tile_data, bsize, best_mode_index); // macroblock modes *mbmi = best_mbmode; @@ -3479,7 +3457,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_tx_diff, best_filter_diff, best_mode_skippable); } -void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, +void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, + TileDataEnc *tile_data, + MACROBLOCK *x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, @@ -3573,7 +3553,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, assert((cm->interp_filter == SWITCHABLE) || (cm->interp_filter == mbmi->interp_filter)); - update_rd_thresh_fact(cpi, bsize, THR_ZEROMV); + vp9_update_rd_thresh_fact(cpi, tile_data, bsize, THR_ZEROMV); vp9_zero(best_pred_diff); vp9_zero(best_filter_diff); @@ -3585,14 +3565,16 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, best_pred_diff, best_tx_diff, best_filter_diff, 0); } -void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, +void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, + TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; RD_OPT *const rd_opt = &cpi->rd; SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -3651,10 +3633,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile, - ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], - yv12_mb); + setup_buffer_inter(cpi, x, tile_info, + ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], + yv12_mb); } else { ref_frame_skip_mask[0] |= (1 << ref_frame); ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; @@ -3712,7 +3694,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // Test best rd so far against threshold for trying this mode. if (rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], - rd_opt->thresh_freq_fact[bsize][ref_index])) + tile_data->thresh_freq_fact[bsize][ref_index])) continue; comp_pred = second_ref_frame > INTRA_FRAME; @@ -3845,7 +3827,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int newbest, rs; int64_t rs_rd; mbmi->interp_filter = switchable_filter_index; - tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile, + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, &rate, &rate_y, &distortion, @@ -3911,7 +3893,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!pred_exists) { // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile, + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, &rate, &rate_y, &distortion, &skippable, &total_sse, @@ -4146,7 +4128,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, (cm->interp_filter == best_mbmode.interp_filter) || !is_inter_block(&best_mbmode)); - update_rd_thresh_fact(cpi, bsize, best_ref_index); + vp9_update_rd_thresh_fact(cpi, tile_data, bsize, best_ref_index); // macroblock modes *mbmi = best_mbmode; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index ed38ce81a..7bbc3c89a 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -29,14 +29,16 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); -void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, - const struct TileInfo *const tile, +void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi, + struct TileDataEnc *tile_data, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -44,8 +46,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi, int64_t best_rd_so_far); void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, + struct TileDataEnc *tile_data, struct macroblock *x, - const struct TileInfo *const tile, int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 9e3ee2c94..3315aa6a1 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -142,8 +142,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, if (speed >= 5) { int i; - - sf->partition_search_type = FIXED_PARTITION; sf->optimize_coefficients = 0; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; @@ -151,8 +149,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_y_mode_mask[i] = INTRA_DC; sf->intra_uv_mode_mask[i] = INTRA_DC; } - } - if (speed >= 6) { + sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; } } @@ -205,7 +202,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->disable_filter_search_var_thresh = 50; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; @@ -217,8 +213,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, if (speed >= 3) { sf->use_square_partition_only = 1; sf->disable_filter_search_var_thresh = 100; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; - sf->constrain_copy_partition = 1; sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; sf->mv.subpel_iters_per_step = 1; @@ -263,8 +257,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, : STRICT_NEIGHBORING_MIN_MAX; sf->max_partition_size = BLOCK_32X32; sf->min_partition_size = BLOCK_8X8; - sf->partition_check = - (frames_since_key % sf->last_partitioning_redo_frequency == 1); sf->force_frame_boost = is_keyframe || (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = is_keyframe ? 20 : 15; @@ -275,6 +267,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; + sf->adaptive_rd_thresh = 2; + // This feature is only enabled when partition search is disabled. + sf->reuse_inter_pred_sby = 1; if (MIN(cm->width, cm->height) >= 720) sf->partition_search_breakout_dist_thr = (1 << 25); @@ -295,15 +290,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->partition_search_type = VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; sf->mv.search_method = NSTEP; - sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; - - // This feature is only enabled when partition search is disabled. - sf->reuse_inter_pred_sby = 1; - - // Increase mode checking threshold for NEWMV. - sf->elevate_newmv_thresh = 1000; - sf->mv.reduce_first_step_size = 1; } @@ -348,7 +335,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mv.fullpel_search_step_param = 6; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; @@ -368,7 +354,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; sf->last_partitioning_redo_frequency = 4; - sf->constrain_copy_partition = 0; sf->disable_split_mask = 0; sf->mode_search_skip_flags = 0; sf->force_frame_boost = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 951b4af22..a314f6040 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -93,12 +93,6 @@ typedef enum { } MOTION_THRESHOLD; typedef enum { - LAST_FRAME_PARTITION_OFF = 0, - LAST_FRAME_PARTITION_LOW_MOTION = 1, - LAST_FRAME_PARTITION_ALL = 2 -} LAST_FRAME_PARTITION_METHOD; - -typedef enum { USE_FULL_RD = 0, USE_LARGESTALL, USE_TX_8X8 @@ -242,15 +236,6 @@ typedef struct SPEED_FEATURES { // level within a frame. int allow_skip_recode; - // This variable allows us to reuse the last frames partition choices - // (64x64 v 32x32 etc) for this frame. It can be set to only use the last - // frame as a starting point in low motion scenes or always use it. If set - // we use last partitioning_redo frequency to determine how often to redo - // the partitioning from scratch. Adjust_partitioning_from_last_frame - // enables us to adjust up or down one partitioning from the last frames - // partitioning. - LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning; - // The threshold is to determine how slow the motino is, it is used when // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION MOTION_THRESHOLD lf_motion_threshold; @@ -264,8 +249,6 @@ typedef struct SPEED_FEATURES { // precise but significantly faster than the non lp version. int use_lp32x32fdct; - // TODO(JBB): remove this as its no longer used. - // After looking at the first set of modes (set by index here), skip // checking modes for reference frames that don't match the reference frame // of the best so far. @@ -303,12 +286,6 @@ typedef struct SPEED_FEATURES { // use_lastframe_partitioning is set. int last_partitioning_redo_frequency; - // This enables constrained copy partitioning, which, given an input block - // size bsize, will copy previous partition for partitions less than bsize, - // otherwise bsize partition is used. bsize is currently set to 16x16. - // Used for the case where motion is detected in superblock. - int constrain_copy_partition; - // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable // it always, to allow it for only Last frame and Intra, disable it for all // inter modes or to enable it always. @@ -342,10 +319,6 @@ typedef struct SPEED_FEATURES { // Fast quantization process path int use_quant_fp; - // Search through variable block partition types in non-RD mode decision - // encoding process for RTC. - int partition_check; - // Use finer quantizer in every other few frames that run variable block // partition type search. int force_frame_boost; diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index adf01bf35..0166a50a0 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -313,7 +313,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = cpi->coef_counts[tx_size][type][ref]; vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = - cpi->common.fc.coef_probs[tx_size][type][ref]; + cpi->common.fc->coef_probs[tx_size][type][ref]; unsigned int (*const eob_branch)[COEFF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size][type][ref]; const uint8_t *const band = get_band_translate(tx_size); diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c index bf5fa889f..4ddee7b74 100644 --- a/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -23,18 +23,17 @@ // Compute the sum of all pixel differences of this MB. static INLINE int sum_diff_16x1(__m128i acc_diff) { const __m128i k_1 = _mm_set1_epi16(1); - const __m128i acc_diff_lo = _mm_srai_epi16( - _mm_unpacklo_epi8(acc_diff, acc_diff), 8); - const __m128i acc_diff_hi = _mm_srai_epi16( - _mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_lo = + _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = + _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); - const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, - _mm_srli_si128(hg_fe_dc_ba, 8)); - const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, - _mm_srli_si128(hgfe_dcba, 4)); - int sum_diff = _mm_cvtsi128_si32(hgfedcba); - return sum_diff; + const __m128i hgfe_dcba = + _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = + _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); + return _mm_cvtsi128_si32(hgfedcba); } // Denoise a 16x1 vector. @@ -51,8 +50,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, __m128i acc_diff) { // Calculate differences const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); - const __m128i v_mc_running_avg_y = _mm_loadu_si128( - (const __m128i *)(&mc_running_avg_y[0])); + const __m128i v_mc_running_avg_y = + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); @@ -60,8 +59,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0); // Clamp absolute difference to 16 to be used to get mask. Doing this // allows us to use _mm_cmpgt_epi8, which operates on signed byte. - const __m128i clamped_absdiff = _mm_min_epu8( - _mm_or_si128(pdiff, ndiff), *k_16); + const __m128i clamped_absdiff = + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16); // Get masks for l2 l1 and l0 adjustments. const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff); @@ -95,24 +94,22 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig, } // Denoise a 16x1 vector with a weaker filter. -static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig, - const uint8_t *mc_running_avg_y, - uint8_t *running_avg_y, - const __m128i k_0, - const __m128i k_delta, - __m128i acc_diff) { +static INLINE __m128i vp9_denoiser_adj_16x1_sse2( + const uint8_t *sig, const uint8_t *mc_running_avg_y, + uint8_t *running_avg_y, const __m128i k_0, + const __m128i k_delta, __m128i acc_diff) { __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0])); // Calculate differences. const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = - _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); + _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0])); const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); // Obtain the sign. FF if diff is negative. const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); // Clamp absolute difference to delta to get the adjustment. const __m128i adj = - _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); + _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta); // Restore the sign and get positive and negative adjustments. __m128i padj, nadj; padj = _mm_andnot_si128(diff_sign, adj); @@ -128,19 +125,16 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig, return acc_diff; } -static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride, - const uint8_t *mc_running_avg_y, - int mc_avg_y_stride, - uint8_t *running_avg_y, int avg_y_stride, - int increase_denoising, - BLOCK_SIZE bs, - int motion_magnitude) { - int sum_diff_thresh; - int r; - int shift_inc = (increase_denoising && - motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; - unsigned char sig_buffer[2][16], mc_running_buffer[2][16], - running_buffer[2][16]; +// Denoiser for 4xM and 8xM blocks. +static int vp9_denoiser_NxM_sse2_small( + const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 1 : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); @@ -148,145 +142,51 @@ static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride, const __m128i k_16 = _mm_set1_epi8(16); // Modify each level's adjustment according to motion_magnitude. const __m128i l3 = _mm_set1_epi8( - (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? - 7 + shift_inc : 6); + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); // Difference between level 3 and level 2 is 2. const __m128i l32 = _mm_set1_epi8(2); // Difference between level 2 and level 1 is 1. const __m128i l21 = _mm_set1_epi8(1); - int sum_diff = 0; - - for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) { - vpx_memcpy(sig_buffer[r], sig, 4); - vpx_memcpy(sig_buffer[r] + 4, sig + sig_stride, 4); - vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride * 2, 4); - vpx_memcpy(sig_buffer[r] + 12, sig + sig_stride * 3, 4); - vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 4); - vpx_memcpy(mc_running_buffer[r] + 4, mc_running_avg_y + - mc_avg_y_stride, 4); - vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y + - mc_avg_y_stride * 2, 4); - vpx_memcpy(mc_running_buffer[r] + 12, mc_running_avg_y + - mc_avg_y_stride * 3, 4); - vpx_memcpy(running_buffer[r], running_avg_y, 4); - vpx_memcpy(running_buffer[r] + 4, running_avg_y + - avg_y_stride, 4); - vpx_memcpy(running_buffer[r] + 8, running_avg_y + - avg_y_stride * 2, 4); - vpx_memcpy(running_buffer[r] + 12, running_avg_y + - avg_y_stride * 3, 4); - acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], - mc_running_buffer[r], - running_buffer[r], - &k_0, &k_4, &k_8, &k_16, - &l3, &l32, &l21, acc_diff); - vpx_memcpy(running_avg_y, running_buffer[r], 4); - vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4); - vpx_memcpy(running_avg_y + avg_y_stride * 2, - running_buffer[r] + 8, 4); - vpx_memcpy(running_avg_y + avg_y_stride * 3, - running_buffer[r] + 12, 4); - // Update pointers for next iteration. - sig += (sig_stride << 2); - mc_running_avg_y += (mc_avg_y_stride << 2); - running_avg_y += (avg_y_stride << 2); - } - - { - sum_diff = sum_diff_16x1(acc_diff); - sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); - if (abs(sum_diff) > sum_diff_thresh) { - // Before returning to copy the block (i.e., apply no denoising), - // checK if we can still apply some (weaker) temporal filtering to - // this block, that would otherwise not be denoised at all. Simplest - // is to apply an additional adjustment to running_avg_y to bring it - // closer to sig. The adjustment is capped by a maximum delta, and - // chosen such that in most cases the resulting sum_diff will be - // within the accceptable range given by sum_diff_thresh. + const uint8_t shift = (width == 4) ? 2 : 1; - // The delta is set by the excess of absolute pixel diff over the - // threshold. - int delta = ((abs(sum_diff) - sum_diff_thresh) - >> num_pels_log2_lookup[bs]) + 1; - // Only apply the adjustment for max delta up to 3. - if (delta < 4) { - const __m128i k_delta = _mm_set1_epi8(delta); - running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); - sum_diff = 0; - for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) { - acc_diff = vp9_denoiser_adj_16x1_sse2( - sig_buffer[r], mc_running_buffer[r], - running_buffer[r], k_0, k_delta, - acc_diff); - vpx_memcpy(running_avg_y, running_buffer[r], 4); - vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4); - vpx_memcpy(running_avg_y + avg_y_stride * 2, - running_buffer[r] + 8, 4); - vpx_memcpy(running_avg_y + avg_y_stride * 3, - running_buffer[r] + 12, 4); - // Update pointers for next iteration. - running_avg_y += (avg_y_stride << 2); - } - sum_diff = sum_diff_16x1(acc_diff); - if (abs(sum_diff) > sum_diff_thresh) { - return COPY_BLOCK; - } - } else { - return COPY_BLOCK; - } + for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) { + vpx_memcpy(sig_buffer[r], sig, width); + vpx_memcpy(sig_buffer[r] + width, sig + sig_stride, width); + vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, width); + vpx_memcpy(mc_running_buffer[r] + width, + mc_running_avg_y + mc_avg_y_stride, width); + vpx_memcpy(running_buffer[r], running_avg_y, width); + vpx_memcpy(running_buffer[r] + width, + running_avg_y + avg_y_stride, width); + if (width == 4) { + vpx_memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width); + vpx_memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width); + vpx_memcpy(mc_running_buffer[r] + width * 2, + mc_running_avg_y + mc_avg_y_stride * 2, width); + vpx_memcpy(mc_running_buffer[r] + width * 3, + mc_running_avg_y + mc_avg_y_stride * 3, width); + vpx_memcpy(running_buffer[r] + width * 2, + running_avg_y + avg_y_stride * 2, width); + vpx_memcpy(running_buffer[r] + width * 3, + running_avg_y + avg_y_stride * 3, width); } - } - return FILTER_BLOCK; -} - -static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride, - const uint8_t *mc_running_avg_y, - int mc_avg_y_stride, - uint8_t *running_avg_y, int avg_y_stride, - int increase_denoising, - BLOCK_SIZE bs, - int motion_magnitude) { - int sum_diff_thresh; - int r; - int shift_inc = (increase_denoising && - motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; - unsigned char sig_buffer[8][16], mc_running_buffer[8][16], - running_buffer[8][16]; - __m128i acc_diff = _mm_setzero_si128(); - const __m128i k_0 = _mm_setzero_si128(); - const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); - const __m128i k_8 = _mm_set1_epi8(8); - const __m128i k_16 = _mm_set1_epi8(16); - // Modify each level's adjustment according to motion_magnitude. - const __m128i l3 = _mm_set1_epi8( - (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? - 7 + shift_inc : 6); - // Difference between level 3 and level 2 is 2. - const __m128i l32 = _mm_set1_epi8(2); - // Difference between level 2 and level 1 is 1. - const __m128i l21 = _mm_set1_epi8(1); - int sum_diff = 0; - - for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) { - vpx_memcpy(sig_buffer[r], sig, 8); - vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride, 8); - vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 8); - vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y + - mc_avg_y_stride, 8); - vpx_memcpy(running_buffer[r], running_avg_y, 8); - vpx_memcpy(running_buffer[r] + 8, running_avg_y + - avg_y_stride, 8); acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r], running_buffer[r], &k_0, &k_4, &k_8, &k_16, &l3, &l32, &l21, acc_diff); - vpx_memcpy(running_avg_y, running_buffer[r], 8); - vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8); + vpx_memcpy(running_avg_y, running_buffer[r], width); + vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width); + if (width == 4) { + vpx_memcpy(running_avg_y + avg_y_stride * 2, + running_buffer[r] + width * 2, width); + vpx_memcpy(running_avg_y + avg_y_stride * 3, + running_buffer[r] + width * 3, width); + } // Update pointers for next iteration. - sig += (sig_stride << 1); - mc_running_avg_y += (mc_avg_y_stride << 1); - running_avg_y += (avg_y_stride << 1); + sig += (sig_stride << shift); + mc_running_avg_y += (mc_avg_y_stride << shift); + running_avg_y += (avg_y_stride << shift); } { @@ -294,54 +194,61 @@ static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride, sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), - // checK if we can still apply some (weaker) temporal filtering to + // check if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest // is to apply an additional adjustment to running_avg_y to bring it // closer to sig. The adjustment is capped by a maximum delta, and // chosen such that in most cases the resulting sum_diff will be - // within the accceptable range given by sum_diff_thresh. + // within the acceptable range given by sum_diff_thresh. // The delta is set by the excess of absolute pixel diff over the // threshold. - int delta = ((abs(sum_diff) - sum_diff_thresh) - >> num_pels_log2_lookup[bs]) + 1; + const int delta = ((abs(sum_diff) - sum_diff_thresh) >> + num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { const __m128i k_delta = _mm_set1_epi8(delta); running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]); - for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) { + for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) { acc_diff = vp9_denoiser_adj_16x1_sse2( - sig_buffer[r], mc_running_buffer[r], - running_buffer[r], k_0, k_delta, - acc_diff); - vpx_memcpy(running_avg_y, running_buffer[r], 8); - vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8); + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + k_0, k_delta, acc_diff); + vpx_memcpy(running_avg_y, running_buffer[r], width); + vpx_memcpy(running_avg_y + avg_y_stride, + running_buffer[r] + width, width); + if (width == 4) { + vpx_memcpy(running_avg_y + avg_y_stride * 2, + running_buffer[r] + width * 2, width); + vpx_memcpy(running_avg_y + avg_y_stride * 3, + running_buffer[r] + width * 3, width); + } // Update pointers for next iteration. - running_avg_y += (avg_y_stride << 1); + running_avg_y += (avg_y_stride << shift); } sum_diff = sum_diff_16x1(acc_diff); if (abs(sum_diff) > sum_diff_thresh) { return COPY_BLOCK; } } else { - return COPY_BLOCK; + return COPY_BLOCK; } } } return FILTER_BLOCK; } -static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, - const uint8_t *mc_running_avg_y, - int mc_avg_y_stride, - uint8_t *running_avg_y, - int avg_y_stride, - int increase_denoising, BLOCK_SIZE bs, - int motion_magnitude) { - int sum_diff_thresh; - int r, c; - int shift_inc = (increase_denoising && - motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; +// Denoiser for 16xM, 32xM and 64xM blocks +static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, + uint8_t *running_avg_y, + int avg_y_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + int sum_diff_thresh, r, c, sum_diff = 0; + const int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 1 : 0; __m128i acc_diff[4][4]; const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); @@ -349,13 +256,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, const __m128i k_16 = _mm_set1_epi8(16); // Modify each level's adjustment according to motion_magnitude. const __m128i l3 = _mm_set1_epi8( - (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? - 7 + shift_inc : 6); + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); // Difference between level 3 and level 2 is 2. const __m128i l32 = _mm_set1_epi8(2); // Difference between level 2 and level 1 is 1. const __m128i l21 = _mm_set1_epi8(1); - int sum_diff = 0; for (c = 0; c < 4; ++c) { for (r = 0; r < 4; ++r) { @@ -363,13 +268,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, } } - for (r = 0; r < (4 << b_height_log2_lookup[bs]); r++) { + for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2( - sig, mc_running_avg_y, - running_avg_y, - &k_0, &k_4, &k_8, &k_16, - &l3, &l32, &l21, acc_diff[c>>4][r>>4]); + sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, + &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; @@ -385,8 +288,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, // Update pointers for next iteration. sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride; mc_running_avg_y = mc_running_avg_y - - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + - mc_avg_y_stride; + 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + + mc_avg_y_stride; running_avg_y = running_avg_y - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + avg_y_stride; @@ -395,8 +298,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, { sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); if (abs(sum_diff) > sum_diff_thresh) { - int delta = ((abs(sum_diff) - sum_diff_thresh) - >> num_pels_log2_lookup[bs]) + 1; + const int delta = ((abs(sum_diff) - sum_diff_thresh) >> + num_pels_log2_lookup[bs]) + 1; // Only apply the adjustment for max delta up to 3. if (delta < 4) { @@ -408,9 +311,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride, for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) { acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2( - sig, mc_running_avg_y, - running_avg_y, k_0, - k_delta, acc_diff[c>>4][r>>4]); + sig, mc_running_avg_y, running_avg_y, k_0, + k_delta, acc_diff[c>>4][r>>4]); // Update pointers for next iteration. sig += 16; mc_running_avg_y += 16; @@ -449,25 +351,25 @@ int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride, BLOCK_SIZE bs, int motion_magnitude) { if (bs == BLOCK_4X4 || bs == BLOCK_4X8) { - return vp9_denoiser_4xM_sse2(sig, sig_stride, - mc_avg, mc_avg_stride, - avg, avg_stride, - increase_denoising, - bs, motion_magnitude); + return vp9_denoiser_NxM_sse2_small(sig, sig_stride, + mc_avg, mc_avg_stride, + avg, avg_stride, + increase_denoising, + bs, motion_magnitude, 4); } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) { - return vp9_denoiser_8xM_sse2(sig, sig_stride, - mc_avg, mc_avg_stride, - avg, avg_stride, - increase_denoising, - bs, motion_magnitude); + return vp9_denoiser_NxM_sse2_small(sig, sig_stride, + mc_avg, mc_avg_stride, + avg, avg_stride, + increase_denoising, + bs, motion_magnitude, 8); } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 || bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 || bs == BLOCK_64X32 || bs == BLOCK_64X64) { - return vp9_denoiser_64_32_16xM_sse2(sig, sig_stride, - mc_avg, mc_avg_stride, - avg, avg_stride, - increase_denoising, - bs, motion_magnitude); + return vp9_denoiser_NxM_sse2_big(sig, sig_stride, + mc_avg, mc_avg_stride, + avg, avg_stride, + increase_denoising, + bs, motion_magnitude); } else { return COPY_BLOCK; } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index adae18b48..d3c2a138c 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -33,6 +33,8 @@ struct vp9_extracfg { vp8e_tuning tuning; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; + unsigned int rc_max_inter_bitrate_pct; + unsigned int gf_cbr_boost_pct; unsigned int lossless; unsigned int frame_parallel_decoding_mode; AQ_MODE aq_mode; @@ -54,6 +56,8 @@ static struct vp9_extracfg default_extra_cfg = { VP8_TUNE_PSNR, // tuning 10, // cq_level 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct 0, // lossless 0, // frame_parallel_decoding_mode NO_AQ, // aq_mode @@ -380,6 +384,8 @@ static vpx_codec_err_t set_encoder_config( // Convert target bandwidth from Kbit/s to Bit/s oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; + oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; + oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; oxcf->best_allowed_q = extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer); @@ -649,6 +655,22 @@ static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_inter_bitrate_pct = + CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_cbr_boost_pct = + CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -1266,6 +1288,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8E_SET_TUNING, ctrl_set_tuning}, {VP8E_SET_CQ_LEVEL, ctrl_set_cq_level}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct}, + {VP8E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct}, + {VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct}, {VP9E_SET_LOSSLESS, ctrl_set_lossless}, {VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode}, {VP9E_SET_AQ_MODE, ctrl_set_aq_mode}, |