diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_filter.c | 18 | ||||
-rw-r--r-- | vp9/common/vp9_filter.h | 3 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 2 | ||||
-rw-r--r-- | vp9/common/vp9_thread_common.c | 157 | ||||
-rw-r--r-- | vp9/common/vp9_thread_common.h | 19 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodeframe.c | 100 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.h | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 391 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 55 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 98 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.h | 27 | ||||
-rw-r--r-- | vp9/encoder/vp9_mbgraph.c | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 223 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_partition_models.h | 306 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 9 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 28 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 20 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.h | 29 | ||||
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 5 | ||||
-rw-r--r-- | vp9/vp9_dx_iface.c | 11 | ||||
-rw-r--r-- | vp9/vp9_dx_iface.h | 1 |
25 files changed, 1347 insertions, 181 deletions
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 6c43af8ce..cadae6f2e 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -63,6 +63,20 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 } }; -const InterpKernel *vp9_filter_kernels[4] = { - sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters +// 4-tap filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -3, 125, 8, -2, 0, 0 }, + { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 115, 27, -6, 0, 0 }, + { 0, 0, -10, 108, 37, -7, 0, 0 }, { 0, 0, -11, 101, 47, -9, 0, 0 }, + { 0, 0, -11, 93, 56, -10, 0, 0 }, { 0, 0, -12, 85, 66, -11, 0, 0 }, + { 0, 0, -11, 75, 75, -11, 0, 0 }, { 0, 0, -11, 66, 85, -12, 0, 0 }, + { 0, 0, -10, 56, 93, -11, 0, 0 }, { 0, 0, -9, 47, 101, -11, 0, 0 }, + { 0, 0, -7, 37, 108, -10, 0, 0 }, { 0, 0, -6, 27, 115, -8, 0, 0 }, + { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 125, -3, 0, 0 } +}; + +const InterpKernel *vp9_filter_kernels[5] = { + sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters, + sub_pel_filters_4 }; diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index b379665b1..0382c88e7 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -25,6 +25,7 @@ extern "C" { #define EIGHTTAP_SHARP 2 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ #define BILINEAR 3 +#define FOURTAP 4 // The codec can operate in four possible inter prediction filter mode: // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. 
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) @@ -32,7 +33,7 @@ extern "C" { typedef uint8_t INTERP_FILTER; -extern const InterpKernel *vp9_filter_kernels[4]; +extern const InterpKernel *vp9_filter_kernels[5]; #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 45d3b0f82..c5c63e476 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -259,6 +259,8 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index d4b076645..36530fae6 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -229,6 +229,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. 
For example, for 4k @@ -266,6 +288,25 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, pthread_cond_init(&lf_sync->cond[i], NULL); } } + pthread_mutex_init(&lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + int i; + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + int i; + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); + } + } } #endif // CONFIG_MULTITHREAD @@ -276,6 +317,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. lf_sync->sync_range = get_sync_range(width); } @@ -298,15 +344,126 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { } vpx_free(lf_sync->cond); } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + pthread_mutex_destroy(&lf_sync->lf_mutex); + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); + } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. 
vp9_zero(*lf_sync); } } +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + int cur_row; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(&lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(&lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + while (lf_sync->num_tiles_done[cur_row] < tile_cols) { /* spurious wakeups */ + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(&lf_sync->lf_mutex); + if (lf_sync->corrupted) { + return_val = -1; + } + pthread_mutex_unlock(&lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. 
+ * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + // Accumulate frame counts. 
void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h index f92df5bd6..b97e9ee13 100644 --- a/vp9/common/vp9_thread_common.h +++ b/vp9/common/vp9_thread_common.h @@ -37,6 +37,14 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; } VP9LfSync; // Allocate memory for loopfilter row synchronization. @@ -53,6 +61,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 48c49e2f5..95e376d04 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1451,6 +1451,25 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, return vpx_reader_find_end(&tile_data->bit_reader); } +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; 
mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. @@ -1461,6 +1480,12 @@ static int tile_worker_hook(void *arg1, void *arg2) { TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; + VP9_COMMON *cm = &pbi->common; + + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; volatile int n = tile_data->buf_start; tile_data->error_info.setjmp = 1; @@ -1468,14 +1493,26 @@ static int tile_worker_hook(void *arg1, void *arg2) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } return 0; } tile_data->xd.corrupted = 0; do { - int mi_row, mi_col; + int mi_col; const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. 
(So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; vp9_zero(tile_data->dqcoeff); vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, tile_data->data_end, buf->size, @@ -1493,6 +1530,14 @@ static int tile_worker_hook(void *arg1, void *arg2) { mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } } if (buf->col == final_col) { @@ -1500,6 +1545,21 @@ static int tile_worker_hook(void *arg1, void *arg2) { } } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + tile_data->data_end = bit_reader_end; return !tile_data->xd.corrupted; } @@ -1516,6 +1576,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, VP9_COMMON *const cm = &pbi->common; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << 
cm->log2_tile_rows; @@ -1542,12 +1604,26 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, } } + // Initialize LPF + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; TileWorkerData *const tile_data = &pbi->tile_worker_data[n + pbi->total_tiles]; winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; @@ -2069,17 +2145,19 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { // Multi-threaded tile decoder *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, + cm->lf.filter_level, 0, 0, pbi->tile_workers, + pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. 
Frame data is corrupted."); } - } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 1c488961a..425c8964c 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -37,6 +37,8 @@ typedef struct TileWorkerData { int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive vpx_reader bit_reader; FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); @@ -74,6 +76,7 @@ typedef struct VP9Decoder { int hold_ref_buf; // hold the reference buffer. int row_mt; + int lpf_mt_opt; } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 6ea264f09..563fdbbde 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -208,6 +208,11 @@ struct macroblock { void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); #endif +#if CONFIG_ML_VAR_PARTITION + DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]); +#endif // CONFIG_ML_VAR_PARTITION + + struct scale_factors *me_sf; }; #ifdef __cplusplus diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f1527f930..72dc13797 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3393,6 +3393,139 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, #undef FEATURES #undef LABELS +// Use a neural net model to prune partition-none and partition-split search. +// The model uses prediction residue variance and quantization step size as +// input features. 
+#define FEATURES 6 +static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + const NN_CONFIG *nn_config = NULL; + DECLARE_ALIGNED(16, uint8_t, pred_buf[64 * 64]); + const int speed = cpi->oxcf.speed; + int i; + float thresh = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_var_rd_part_nnconfig_64; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_var_rd_part_nnconfig_32; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_var_rd_part_nnconfig_16; + thresh = speed > 0 ? 3.5f : 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_var_rd_part_nnconfig_8; + if (cm->width >= 720 && cm->height >= 720) + thresh = speed > 0 ? 2.5f : 2.0f; + else + thresh = speed > 0 ? 3.5f : 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + mi->ref_frame[1] = NONE; + mi->sb_type = bsize; + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref); + MV ref_mv = { 0, 0 }; + MV ref_mv_full = { 0, 0 }; + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV best_mv = { 0, 0 }; + int cost_list[5]; + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ref - 1].sf); + mi->ref_frame[0] = ref; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 
1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. 
+ if (score > thresh) *none = 0; + if (score < -thresh) *split = 0; + } +} +#undef FEATURES +#undef LABELS + int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult) { TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; @@ -3624,6 +3757,21 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->partitioning = PARTITION_NONE; + if (cpi->sf.ml_var_partition_pruning) { + int do_ml_var_partition_pruning = + !frame_is_intra_only(cm) && partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + do_ml_var_partition_pruning = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + if (do_ml_var_partition_pruning) { + ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + } + } + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, @@ -3738,6 +3886,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; } // store estimated motion vector @@ -4345,6 +4496,83 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { } } +#if CONFIG_ML_VAR_PARTITION +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + float thresh_low = -0.2f; + float thresh_high = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_var_part_nnconfig_64; + thresh_low = -0.3f; + thresh_high = -0.1f; + break; + case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break; + case BLOCK_16X16: nn_config 
= &vp9_var_part_nnconfig_16; break; + case BLOCK_8X8: break; + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 8 * (mi_row & 7); + const int sb_offset_col = 8 * (mi_col & 7); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 
1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + + assert(feature_idx == FEATURES); + nn_predict(features, nn_config, score); + if (score[0] > thresh_high) return 3; + if (score[0] < thresh_low) return 0; + return -1; + } +} +#undef FEATURES +#undef LABELS +#endif // CONFIG_ML_VAR_PARTITION + static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -4374,6 +4602,11 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; +#if CONFIG_ML_VAR_PARTITION + const int use_ml_based_partitioning = + sf->partition_search_type == ML_BASED_PARTITION; +#endif // CONFIG_ML_VAR_PARTITION + (void)*tp_orig; // Avoid checking for rectangular partitions for speed >= 6. @@ -4404,6 +4637,20 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, partition_vert_allowed &= force_vert_split; } +#if CONFIG_ML_VAR_PARTITION + if (use_ml_based_partitioning) { + if (partition_none_allowed || do_split) do_rect = 0; + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == 0) do_split = 0; + if (ml_predicted_partition == 3) partition_none_allowed = 0; + } + } +#endif // CONFIG_ML_VAR_PARTITION + + if (!partition_none_allowed && !do_split) do_rect = 1; + ctx->pred_pixel_ready = !(partition_vert_allowed || partition_horz_allowed || do_split); @@ -4417,26 +4664,28 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, ctx->skip = x->skip; if (this_rdc.rate != INT_MAX) { - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; this_rdc.rdcost = RDCOST(x->rdmult, 
x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; - int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; - - dist_breakout_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - rate_breakout_thr *= num_pels_log2_lookup[bsize]; - best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && - this_rdc.dist < dist_breakout_thr) { - do_split = 0; - do_rect = 0; +#if CONFIG_ML_VAR_PARTITION + if (!use_ml_based_partitioning) +#endif // CONFIG_ML_VAR_PARTITION + { + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { + do_split = 0; + do_rect = 0; + } } } } @@ -4835,6 +5084,111 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#if CONFIG_ML_VAR_PARTITION +// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock. 
+static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + int pixels_wide = 64, pixels_high = 64; + unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad = UINT_MAX; + + assert(yv12 != NULL); + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. 
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NONE; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? 
(y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + y_sad = y_sad_g; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} +#endif // CONFIG_ML_VAR_PARTITION + static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { @@ -4926,6 +5280,17 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; +#if CONFIG_ML_VAR_PARTITION + case ML_BASED_PARTITION: + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + x->sb_pickmode_part = 1; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); + break; +#endif // CONFIG_ML_VAR_PARTITION case SOURCE_VAR_BASED_PARTITION: set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col); nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index c6e9fc840..c10d010a4 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2449,6 +2449,17 @@ 
VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_loop_filter_init(cm); + // Set up the unit scaling factor used during motion search. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.me_sf = &cpi->me_sf; + cm->error.setjmp = 0; return cpi; @@ -3058,6 +3069,12 @@ void update_ref_frames(VP9_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } + + if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) { + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } } void vp9_update_reference_frames(VP9_COMP *cpi) { @@ -3211,8 +3228,8 @@ void vp9_scale_references(VP9_COMP *cpi) { if (cpi->oxcf.pass == 0 && !cpi->use_svc) { // Check for release of scaled reference. buf_idx = cpi->scaled_ref_idx[ref_frame - 1]; - buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL; - if (buf != NULL) { + if (buf_idx != INVALID_IDX) { + buf = &pool->frame_bufs[buf_idx]; --buf->ref_count; cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; } @@ -3243,22 +3260,21 @@ static void release_scaled_references(VP9_COMP *cpi) { refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height)) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } } } } else { for (i = 0; i < MAX_REF_FRAMES; ++i) { const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; --buf->ref_count; cpi->scaled_ref_idx[i] = INVALID_IDX; } @@ -4904,6 +4920,8 @@ static void init_ref_frame_bufs(VP9_COMMON *cm) { cm->new_fb_idx = INVALID_IDX; for (i = 0; i < REF_FRAMES; ++i) { cm->ref_frame_map[i] = INVALID_IDX; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { pool->frame_bufs[i].ref_count = 0; } } @@ -5539,12 +5557,13 @@ uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, /* restore UMV window */ x->mv_limits = tmp_mv_limits; + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
// Ignore mv costing by sending NULL pointer instead of cost array bestsme = cpi->find_fractional_mv_step( x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); return bestsme; } @@ -6498,11 +6517,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->summedp_quality += frame_ssim2 * weight; cpi->summedp_weights += weight; #if 0 - { + if (cm->show_frame) { FILE *f = fopen("q_used.stt", "a"); fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, - frame_psnr2, frame_ssim2); + cpi->common.current_video_frame, psnr2.psnr[1], + psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2); fclose(f); } #endif diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 25396bc7a..a9f7daf07 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -605,6 +605,7 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; + struct scale_factors me_sf; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 38e98cd1e..58c3a435d 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2085,18 +2085,6 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. 
-static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; -} - // Used in corpus vbr: Calculates the total normalized group complexity score // for a given number of frames starting at the current position in the stats // file. @@ -2172,6 +2160,20 @@ static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group, find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end); } +static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group, + int frame_index, + int source_alt_ref_active) { + if (source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 0; + } +} + static int define_gf_group_structure(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; @@ -2179,7 +2181,6 @@ static int define_gf_group_structure(VP9_COMP *cpi) { int i; int frame_index = 0; int key_frame; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; int normal_frames; key_frame = cpi->common.frame_type == KEY_FRAME; @@ -2187,24 +2188,11 @@ static int define_gf_group_structure(VP9_COMP *cpi) { gf_group->frame_start = cpi->common.current_video_frame; gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval - 1; - get_arf_buffer_indices(arf_buffer_indices); - // For key frames the frame target rate is already set and it // is also the golden frame. 
// === [frame_index == 0] === - if (!key_frame) { - if (rc->source_alt_ref_active) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - gf_group->layer_depth[frame_index] = 0; - } - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - } + if (!key_frame) + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active); ++frame_index; @@ -2213,12 +2201,8 @@ static int define_gf_group_structure(VP9_COMP *cpi) { gf_group->update_type[frame_index] = ARF_UPDATE; gf_group->rf_level[frame_index] = GF_ARF_STD; gf_group->layer_depth[frame_index] = 1; - gf_group->arf_src_offset[frame_index] = (unsigned char)(rc->baseline_gf_interval - 1); - - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; ++frame_index; } @@ -2226,13 +2210,9 @@ static int define_gf_group_structure(VP9_COMP *cpi) { find_arf_order(cpi, gf_group, &frame_index, 2, 0, rc->baseline_gf_interval - 1); - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } + set_gf_overlay_frame_type(gf_group, frame_index, + rc->source_alt_ref_pending); + gf_group->arf_src_offset[frame_index] = 0; return frame_index; @@ -2242,12 +2222,8 @@ static int define_gf_group_structure(VP9_COMP *cpi) { rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); for (i = 0; i < normal_frames; ++i) { - int arf_idx = 0; if (twopass->stats_in >= twopass->stats_in_end) break; - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - 
gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->update_type[frame_index] = LF_UPDATE; gf_group->rf_level[frame_index] = INTER_NORMAL; gf_group->arf_src_offset[frame_index] = 0; @@ -2260,8 +2236,8 @@ static int define_gf_group_structure(VP9_COMP *cpi) { // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to // vp9_rc_get_second_pass_params() the data will be undefined. - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; + + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending); if (rc->source_alt_ref_pending) { gf_group->update_type[frame_index] = OVERLAY_UPDATE; @@ -2383,9 +2359,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, } gf_group->bit_allocation[idx] = 0; - for (idx = 0; idx < gop_frames; ++idx) - if (gf_group->update_type[idx] == LF_UPDATE) break; - return; } @@ -2485,6 +2458,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + double gop_intra_factor = 1.0; + // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. if (is_key_frame == 0) { @@ -2524,8 +2499,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { { int int_max_q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, cpi->common.bit_depth)); - int int_lbq = (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, - cpi->common.bit_depth)); + int q_term = (cm->current_video_frame == 0) + ? 
int_max_q / 32 + : (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, + cpi->common.bit_depth) / + 6); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); active_min_gf_interval = @@ -2535,7 +2513,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // bits to spare and are better with a smaller interval and smaller boost. // At high Q when there are few bits to spare we are better with a longer // interval to spread the cost of the GF. - active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); + active_max_gf_interval = 11 + arf_active_or_kf + VPXMIN(5, q_term); + + // Force max GF interval to be odd. + active_max_gf_interval = active_max_gf_interval | 0x01; // We have: active_min_gf_interval <= // rc->max_gf_interval + arf_active_or_kf. @@ -2552,6 +2533,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { active_max_gf_interval = rc->frames_to_key / 2; } + if (cpi->multi_layer_arf) { + int layers = 0; + int max_layers = VPXMIN(MAX_ARF_LAYERS, cpi->oxcf.enable_auto_arf); + + // Adapt the intra_error factor to active_max_gf_interval limit. + for (i = active_max_gf_interval; i > 0; i >>= 1) ++layers; + + layers = VPXMIN(max_layers, layers); + gop_intra_factor += (layers * 0.25); + } + i = 0; while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { ++i; @@ -2624,11 +2616,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Don't break out with a very short interval. 
(i >= active_min_gf_interval) && // If possible dont break very close to a kf - ((rc->frames_to_key - i) >= rc->min_gf_interval) && + ((rc->frames_to_key - i) >= rc->min_gf_interval) && (i & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (sr_accumulator > next_frame.intra_error)))) { + (sr_accumulator > gop_intra_factor * next_frame.intra_error)))) { break; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index da1c61c49..9bd0a9e04 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -43,12 +43,6 @@ typedef struct { #define INVALID_ROW -1 -// Length of the bi-predictive frame group (BFG) -// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain -// number of bi-predictive frames. -#define BFG_INTERVAL 2 -#define MAX_EXT_ARFS 2 -#define MIN_EXT_ARF_INTERVAL 4 #define MAX_ARF_LAYERS 6 typedef struct { @@ -117,8 +111,9 @@ typedef enum { GF_UPDATE = 2, ARF_UPDATE = 3, OVERLAY_UPDATE = 4, - USE_BUF_FRAME = 5, // Use show existing frame, no ref buffer update - FRAME_UPDATE_TYPES = 6 + MID_OVERLAY_UPDATE = 5, + USE_BUF_FRAME = 6, // Use show existing frame, no ref buffer update + FRAME_UPDATE_TYPES = 7 } FRAME_UPDATE_TYPE; #define FC_ANIMATION_THRESH 0.15 @@ -134,10 +129,6 @@ typedef struct { FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2]; unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2]; unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2]; - unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 2]; - unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 2]; - unsigned char brf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2]; - unsigned char bidir_pred_enabled[MAX_STATIC_GF_GROUP_LENGTH + 2]; int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2]; int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2]; @@ -203,7 +194,6 @@ struct ThreadData; struct TileDataEnc; void 
vp9_init_first_pass(struct VP9_COMP *cpi); -void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); @@ -222,17 +212,6 @@ void vp9_twopass_postencode_update(struct VP9_COMP *cpi); void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, int *scaled_frame_height); -static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { - assert(MAX_EXT_ARFS > 0); - if (arf_pending) { - if (interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)) - return MAX_EXT_ARFS; - else if (interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS) - return MAX_EXT_ARFS - 1; - } - return 0; -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 2ec048b53..831c79c17 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -57,11 +57,12 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, { uint32_t distortion; uint32_t sse; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. 
cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0, mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + 0, USE_2_TAPS); } xd->mi[0]->mode = NEWMV; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 995c54fc7..0f9051bb7 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -367,14 +367,12 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { *ir = (int)divide_and_round(x1 * b, y1); } -uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, - const MV *ref_mv, int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - uint32_t *distortion, uint32_t *sse1, - const uint8_t *second_pred, int w, int h) { +uint32_t vp9_skip_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -397,6 +395,7 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)sse; (void)thismse; (void)cost_list; + (void)use_accurate_subpel_search; return besterr; } @@ -406,7 +405,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, 
vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -418,6 +417,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; + (void)use_accurate_subpel_search; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && @@ -471,8 +471,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -531,8 +533,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -617,12 +621,119 @@ static const MV search_step_table[12] = { }; /* clang-format on */ +static int accurate_sub_pel_search( + const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf, + const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src_address, const int src_stride, + const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred, + int w, int h, uint32_t *sse) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h 
!= 0); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride, + pred16, w, this_mv, sf, w, h, 0, kernel, + MV_PRECISION_Q3, 0, 0, xd->bd); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, pred16, w); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address, + src_stride, sse); + } else { + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse); + } + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + } + if (besterr >= UINT_MAX) return UINT_MAX; + return (int)besterr; +#else + int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + (void)xd; + + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// TODO(yunqing): this part can be further refactored. 
+#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#endif + uint32_t vp9_find_best_sub_pixel_tree( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { const uint8_t *const z = x->plane[0].src.buf; const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; @@ -650,6 +761,14 @@ uint32_t vp9_find_best_sub_pixel_tree( int kr, kc; MvLimits subpel_mv_limits; + // TODO(yunqing): need to add 4-tap filter optimization to speed up the + // encoder. 
+ const InterpKernel *kernel = (use_accurate_subpel_search > 0) + ? ((use_accurate_subpel_search == USE_4_TAPS) + ? vp9_filter_kernels[FOURTAP] + : vp9_filter_kernels[EIGHTTAP]) + : vp9_filter_kernels[BILINEAR]; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); minc = subpel_mv_limits.col_min; maxc = subpel_mv_limits.col_max; @@ -674,16 +793,25 @@ uint32_t vp9_find_best_sub_pixel_tree( tr = br + search_step[idx].row; tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv; this_mv.row = tr; this_mv.col = tc; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -705,14 +833,21 @@ uint32_t vp9_find_best_sub_pixel_tree( tc = bc + kc; tr = br + kr; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv = { tr, tc }; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse, second_pred); + if 
(use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, y_stride, + second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -743,20 +878,36 @@ uint32_t vp9_find_best_sub_pixel_tree( if (tr == br && tc != bc) { kc = bc - tc; if (iters_per_step == 1) { - CHECK_BETTER(second, br0, bc0 + kc); + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0, bc0 + kc); + } else { + CHECK_BETTER(second, br0, bc0 + kc); + } } } else if (tr != br && tc == bc) { kr = br - tr; if (iters_per_step == 1) { - CHECK_BETTER(second, br0 + kr, bc0); + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + } else { + CHECK_BETTER(second, br0 + kr, bc0); + } } } if (iters_per_step > 1) { - CHECK_BETTER(second, br0 + kr, bc0); - CHECK_BETTER(second, br0, bc0 + kc); - if (br0 != br || bc0 != bc) { - CHECK_BETTER(second, br0 + kr, bc0 + kc); + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + CHECK_BETTER1(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER1(second, br0 + kr, bc0 + kc); + } + } else { + CHECK_BETTER(second, br0 + kr, bc0); + CHECK_BETTER(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER(second, br0 + kr, bc0 + kc); + } } } } @@ -781,6 +932,7 @@ uint32_t vp9_find_best_sub_pixel_tree( } #undef CHECK_BETTER +#undef CHECK_BETTER1 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -2587,7 +2739,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, (void)tc; \ 
(void)sse; \ (void)thismse; \ - (void)cost_list; + (void)cost_list; \ + (void)use_accurate_subpel_search; // Return the maximum MV. uint32_t vp9_return_max_sub_pixel_mv( @@ -2595,7 +2748,7 @@ uint32_t vp9_return_max_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)minr; @@ -2617,7 +2770,7 @@ uint32_t vp9_return_min_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)maxr; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index adb02bc1a..6bd85a152 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -75,7 +75,7 @@ typedef uint32_t(fractional_mv_step_fp)( int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h); + int h, int use_accurate_subpel_search); extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; diff --git a/vp9/encoder/vp9_partition_models.h b/vp9/encoder/vp9_partition_models.h index 19979e531..904d21400 100644 --- a/vp9/encoder/vp9_partition_models.h +++ b/vp9/encoder/vp9_partition_models.h @@ -18,7 +18,9 @@ extern "C" { #define NN_MAX_HIDDEN_LAYERS 10 #define NN_MAX_NODES_PER_LAYER 128 -// Neural net model config. +// Neural net model config. 
It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. typedef struct { int num_inputs; // Number of input nodes, i.e. features. int num_outputs; // Number of output nodes. @@ -834,6 +836,308 @@ static const NN_CONFIG vp9_partition_nnconfig_16x16 = { }; #undef FEATURES +#if CONFIG_ML_VAR_PARTITION +#define FEATURES 6 +static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f, + -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f, + -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f, + -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f, + 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f, + -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f, + -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f, + -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f, +}; + +static const float vp9_var_part_nn_bias_64_layer0[8] = { + -0.044396f, -0.938166f, 0.000000f, -0.916375f, + 1.242299f, 0.000000f, -0.405734f, 0.014206f, +}; + +static const float vp9_var_part_nn_weights_64_layer1[8] = { + 1.635945f, 0.979557f, 0.455315f, 1.197199f, + -2.251024f, -0.464953f, 1.378676f, -0.111927f, +}; + +static const float vp9_var_part_nn_bias_64_layer1[1] = { + -0.37972447f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_64_layer0, + vp9_var_part_nn_weights_64_layer1, + }, + { + vp9_var_part_nn_bias_64_layer0, + vp9_var_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f, + 0.323736f, -0.510199f, 
2.960693f, 2.937661f, 2.888476f, 2.938315f, + -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f, + 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f, + -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f, + 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f, + -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f, + 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f, +}; + +static const float vp9_var_part_nn_bias_32_layer0[8] = { + 0.368242f, 0.736617f, 0.000000f, 0.757287f, + 0.000000f, 0.613248f, -0.776390f, 0.928497f, +}; + +static const float vp9_var_part_nn_weights_32_layer1[8] = { + 0.939884f, -2.420850f, -0.410489f, -0.186690f, + 0.063287f, -0.522011f, 0.484527f, -0.639625f, +}; + +static const float vp9_var_part_nn_bias_32_layer1[1] = { + -0.6455006f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_32_layer0, + vp9_var_part_nn_weights_32_layer1, + }, + { + vp9_var_part_nn_bias_32_layer0, + vp9_var_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f, + -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f, + -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f, + 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f, + 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f, + -0.463801f, 0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f, + -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f, + 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f, +}; + +static const float vp9_var_part_nn_bias_16_layer0[8] = { + 2.901234f, -1.940932f, -0.198970f, -0.406524f, + 0.059422f, -1.879207f, -0.232340f, 2.979821f, 
+}; + +static const float vp9_var_part_nn_weights_16_layer1[8] = { + -0.528731f, 0.375234f, -0.088422f, 0.668629f, + 0.870449f, 0.578735f, 0.546103f, -1.957207f, +}; + +static const float vp9_var_part_nn_bias_16_layer1[1] = { + -1.95769405f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_16_layer0, + vp9_var_part_nn_weights_16_layer1, + }, + { + vp9_var_part_nn_bias_16_layer0, + vp9_var_part_nn_bias_16_layer1, + }, +}; +#undef FEATURES +#endif // CONFIG_ML_VAR_PARTITION + +#define FEATURES 6 +#define LABELS 1 +static const float vp9_var_rd_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.100129f, 0.128867f, -1.375086f, -2.268096f, -1.470368f, -2.296274f, + 0.034445f, -0.062993f, -2.151904f, 0.523215f, 1.611269f, 1.530051f, + 0.418182f, -1.330239f, 0.828388f, 0.386546f, -0.026188f, -0.055459f, + -0.474437f, 0.861295f, -2.208743f, -0.652991f, -2.985873f, -1.728956f, + 0.388052f, -0.420720f, 2.015495f, 1.280342f, 3.040914f, 1.760749f, + -0.009062f, 0.009623f, 1.579270f, -2.012891f, 1.629662f, -1.796016f, + -0.279782f, -0.288359f, 1.875618f, 1.639855f, 0.903020f, 0.906438f, + 0.553394f, -1.621589f, 0.185063f, 0.605207f, -0.133560f, 0.588689f, +}; + +static const float vp9_var_rd_part_nn_bias_64_layer0[8] = { + 0.659717f, 0.120912f, 0.329894f, -1.586385f, + 1.715839f, 0.085754f, 2.038774f, 0.268119f, +}; + +static const float vp9_var_rd_part_nn_weights_64_layer1[8 * LABELS] = { + -3.445586f, 2.375620f, 1.236970f, 0.804030f, + -2.448384f, 2.827254f, 2.291478f, 0.790252f, +}; + +static const float vp9_var_rd_part_nn_bias_64_layer1[LABELS] = { + -1.16608453f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_64_layer0, + vp9_var_rd_part_nn_weights_64_layer1, + }, + { + 
vp9_var_rd_part_nn_bias_64_layer0, + vp9_var_rd_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.022420f, -0.032201f, 1.228065f, -2.767655f, 1.928743f, 0.566863f, + 0.459229f, 0.422048f, 0.833395f, 0.822960f, -0.232227f, 0.586895f, + 0.442856f, -0.018564f, 0.227672f, -1.291306f, 0.119428f, -0.776563f, + -0.042947f, 0.183129f, 0.592231f, 1.174859f, -0.503868f, 0.270102f, + -0.330537f, -0.036340f, 1.144630f, 1.783710f, 1.216929f, 2.038085f, + 0.373782f, -0.430258f, 1.957002f, 1.383908f, 2.012261f, 1.585693f, + -0.394399f, -0.337523f, -0.238335f, 0.007819f, -0.368294f, 0.437875f, + -0.318923f, -0.242000f, 2.276263f, 1.501432f, 0.645706f, 0.344774f, +}; + +static const float vp9_var_rd_part_nn_bias_32_layer0[8] = { + -0.023846f, -1.348117f, 1.365007f, -1.644164f, + 0.062992f, 1.257980f, -0.098642f, 1.388472f, +}; + +static const float vp9_var_rd_part_nn_weights_32_layer1[8 * LABELS] = { + 3.016729f, 0.622684f, -1.021302f, 1.490383f, + 1.702046f, -2.964618f, 0.689045f, 1.711754f, +}; + +static const float vp9_var_rd_part_nn_bias_32_layer1[LABELS] = { + -1.28798676f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_32_layer0, + vp9_var_rd_part_nn_weights_32_layer1, + }, + { + vp9_var_rd_part_nn_bias_32_layer0, + vp9_var_rd_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_16_layer0[FEATURES * 8] = { + -0.726813f, -0.026748f, 1.376946f, 1.467961f, 1.961810f, 1.690412f, + 0.596484f, -0.261486f, -0.310905f, -0.366311f, -1.300086f, -0.534336f, + 0.040520f, -0.032391f, -1.194214f, 2.438063f, -3.915334f, 1.997270f, + 0.673696f, -0.676393f, 1.654886f, 1.553838f, 1.129691f, 1.360201f, + 0.255001f, 0.336442f, -0.487759f, -0.634555f, 0.479170f, -0.110475f, + -0.661852f, -0.158872f, -0.350243f, -0.303957f, -0.045018f, 0.586151f, + 
-0.262463f, 0.228079f, -1.688776f, -1.594502f, -2.261078f, -1.802535f, + 0.034748f, -0.028476f, 2.713258f, 0.212446f, -1.529202f, -2.560178f, +}; + +static const float vp9_var_rd_part_nn_bias_16_layer0[8] = { + 0.495983f, 1.858545f, 0.162974f, 1.992247f, + -2.698863f, 0.110020f, 0.550830f, 0.420941f, +}; + +static const float vp9_var_rd_part_nn_weights_16_layer1[8 * LABELS] = { + 1.768409f, -1.394240f, 1.076846f, -1.762808f, + 1.517405f, 0.535195f, -0.426827f, 1.002272f, +}; + +static const float vp9_var_rd_part_nn_bias_16_layer1[LABELS] = { + -1.65894794f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_16_layer0, + vp9_var_rd_part_nn_weights_16_layer1, + }, + { + vp9_var_rd_part_nn_bias_16_layer0, + vp9_var_rd_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_8_layer0[FEATURES * 8] = { + -0.804900f, -1.214983f, 0.840202f, 0.686566f, 0.155804f, 0.025542f, + -1.244635f, -0.368403f, 0.364150f, 1.081073f, 0.552387f, 0.452715f, + 0.652968f, -0.293058f, 0.048967f, 0.021240f, -0.662981f, 0.424700f, + 0.008293f, -0.013088f, 0.747007f, -1.453907f, -1.498226f, 1.593252f, + -0.239557f, -0.143766f, 0.064311f, 1.320998f, -0.477411f, 0.026374f, + 0.730884f, -0.675124f, 0.965521f, 0.863658f, 0.809186f, 0.812280f, + 0.513131f, 0.185102f, 0.211354f, 0.793666f, 0.121714f, -0.015383f, + -0.650980f, -0.046581f, 0.911141f, 0.806319f, 0.974773f, 0.815893f, +}; + +static const float vp9_var_rd_part_nn_bias_8_layer0[8] = { + 0.176134f, 0.651308f, 2.007761f, 0.068812f, + 1.061517f, 1.487161f, -2.308147f, 1.099828f, +}; + +static const float vp9_var_rd_part_nn_weights_8_layer1[8 * LABELS] = { + 0.683032f, 1.326393f, -1.661539f, 1.438920f, + 1.118023f, -2.237380f, 1.518468f, 2.010416f, +}; + +static const float vp9_var_rd_part_nn_bias_8_layer1[LABELS] = { + -1.65423989f, +}; + +static const NN_CONFIG 
vp9_var_rd_part_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_8_layer0, + vp9_var_rd_part_nn_weights_8_layer1, + }, + { + vp9_var_rd_part_nn_bias_8_layer0, + vp9_var_rd_part_nn_bias_8_layer1, + }, +}; +#undef FEATURES +#undef LABELS + // Partition pruning model(linear). static const float vp9_partition_feature_mean[24] = { 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 416d437e0..8dce4cf7b 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -247,7 +247,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -1539,7 +1540,8 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); } else if (svc->use_base_mv && svc->spatial_layer_id) { if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; @@ -2758,7 +2760,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, 
cost_list), x->nmvjointcost, x->mvcost, &dummy_dist, - &x->pred_sse[ref_frame], NULL, 0, 0); + &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv; } else { diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 86d2fa18c..76e310ac2 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -777,13 +777,19 @@ static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, +static int get_gf_active_quality(const VP9_COMP *const cpi, int q, vpx_bit_depth_t bit_depth) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_CONTROL *const rc = &cpi->rc; + int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : rc->gfu_boost; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); - return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } @@ -935,7 +941,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, } else { q = active_worst_quality; } - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) { @@ -1082,7 +1088,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; @@ -1097,7 +1103,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { @@ -1278,7 +1284,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; @@ -1287,7 +1293,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, if (!cpi->refresh_alt_ref_frame) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Modify best quality for second level arfs. For mode VPX_Q this // becomes the baseline frame q. 
@@ -1295,7 +1301,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, active_best_quality = (active_best_quality + cq_level + 1) / 2; } } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { @@ -1445,6 +1451,12 @@ void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { cpi->refresh_alt_ref_frame = 0; cpi->rc.is_src_frame_alt_ref = 1; break; + case MID_OVERLAY_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; case USE_BUF_FRAME: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 698faa343..9cde479cd 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1821,7 +1821,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph); + &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search); } // Restore the pointer to the first (possibly scaled) prediction buffer. 
@@ -1875,6 +1875,8 @@ static int64_t rd_pick_best_sub8x8_mode( const BLOCK_SIZE bsize = mi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int pw = num_4x4_blocks_wide << 2; + const int ph = num_4x4_blocks_high << 2; ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; SPEED_FEATURES *const sf = &cpi->sf; @@ -2011,7 +2013,8 @@ static int64_t rd_pick_best_sub8x8_mode( x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop, sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &distortion, - &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0); + &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; @@ -2330,6 +2333,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int best_predmv_idx = x->mv_best_ref_index[ref]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; MV pred_mv[3]; pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; @@ -2452,7 +2457,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index b54587931..87b417a4b 100644 --- 
a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -219,6 +219,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->less_rectangular_check = 1; sf->use_square_partition_only = !boosted; sf->prune_ref_frame_for_rect_partitions = 1; + sf->ml_var_partition_pruning = 1; sf->ml_prune_rect_partition_threhold[0] = -1; sf->ml_prune_rect_partition_threhold[1] = 350; @@ -241,6 +242,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, if (speed >= 1) { sf->enable_tpl_model = 0; + sf->ml_var_partition_pruning = !boosted; sf->ml_prune_rect_partition_threhold[1] = 200; sf->ml_prune_rect_partition_threhold[2] = 200; sf->ml_prune_rect_partition_threhold[3] = 200; @@ -286,9 +288,11 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) : INT_MAX; + sf->use_accurate_subpel_search = USE_4_TAPS; } if (speed >= 2) { + sf->ml_var_partition_pruning = 0; if (oxcf->vbr_corpus_complexity) sf->recode_loop = ALLOW_RECODE_FIRST; else @@ -326,6 +330,8 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, good_quality_mesh_patterns[mesh_density_level][i].interval; } } + + sf->use_accurate_subpel_search = USE_2_TAPS; } if (speed >= 3) { @@ -448,6 +454,7 @@ static void set_rt_speed_feature_framesize_independent( sf->disable_golden_ref = 0; sf->enable_tpl_model = 0; sf->enhanced_full_pixel_motion_search = 0; + sf->use_accurate_subpel_search = USE_2_TAPS; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -623,7 +630,18 @@ static void set_rt_speed_feature_framesize_independent( sf->use_altref_onepass = 1; sf->use_compound_nonrd_pickmode = 1; } +#if CONFIG_ML_VAR_PARTITION + if (!frame_is_intra_only(cm) && cm->width >= 360 && cm->height >= 360) + sf->partition_search_type = ML_BASED_PARTITION; + else + sf->partition_search_type = VAR_BASED_PARTITION; +#if 
CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->partition_search_type = VAR_BASED_PARTITION; +#endif // CONFIG_VP9_HIGHBITDEPTH +#else sf->partition_search_type = VAR_BASED_PARTITION; +#endif // CONFIG_ML_VAR_PARTITION sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; @@ -928,6 +946,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->ml_prune_rect_partition_threhold[1] = -1; sf->ml_prune_rect_partition_threhold[2] = -1; sf->ml_prune_rect_partition_threhold[3] = -1; + sf->ml_var_partition_pruning = 0; + sf->use_accurate_subpel_search = USE_8_TAPS; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 531df704c..0067bb4ac 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -136,20 +136,25 @@ typedef enum { } INTERP_FILTER_MASK; typedef enum { - // Search partitions using RD/NONRD criterion + // Search partitions using RD/NONRD criterion. SEARCH_PARTITION, - // Always use a fixed size partition + // Always use a fixed size partition. FIXED_PARTITION, REFERENCE_PARTITION, // Use an arbitrary partitioning scheme based on source variance within - // a 64X64 SB + // a 64X64 SB. VAR_BASED_PARTITION, - // Use non-fixed partitions based on source variance - SOURCE_VAR_BASED_PARTITION + // Use non-fixed partitions based on source variance. + SOURCE_VAR_BASED_PARTITION, + +#if CONFIG_ML_VAR_PARTITION + // Make partition decisions with machine learning models. 
+ ML_BASED_PARTITION +#endif // CONFIG_ML_VAR_PARTITION } PARTITION_SEARCH_TYPE; typedef enum { @@ -238,6 +243,12 @@ typedef enum { RE_ENCODE_MAXQ = 2 } OVERSHOOT_DETECTION_CBR_RT; +typedef enum { + USE_2_TAPS = 0, + USE_4_TAPS, + USE_8_TAPS, +} SUBPEL_SEARCH_TYPE; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -510,6 +521,10 @@ typedef struct SPEED_FEATURES { // Machine-learning based partition search early termination int ml_partition_search_early_termination; + // Machine-learning based partition search pruning using prediction residue + // variance. + int ml_var_partition_pruning; + // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -577,6 +592,10 @@ typedef struct SPEED_FEATURES { // Allow for disabling golden reference. int disable_golden_ref; + + // Allow sub-pixel search to use interpolation filters with different taps in + // order to achieve accurate motion search result. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; } SPEED_FEATURES; struct VP9_COMP; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 51668d01d..7ac70c8ea 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -421,12 +421,13 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /* restore UMV window */ x->mv_limits = tmp_mv_limits; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. 
// Ignore mv costing by sending NULL pointer instead of cost array bestsme = cpi->find_fractional_mv_step( x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); // Restore input state x->plane[0].src = src; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index fdff87768..6a4cb9acf 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -270,6 +270,9 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { RANGE_CHECK(ctx, row_mt, 0, 1); ctx->pbi->row_mt = ctx->row_mt; + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; + // If postprocessing was enabled by the application and a // configuration has not been provided, default it. if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) @@ -658,6 +661,13 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -670,6 +680,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, // Getters { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vp9/vp9_dx_iface.h b/vp9/vp9_dx_iface.h index a1c335278..f60688c4d 100644 --- a/vp9/vp9_dx_iface.h +++ b/vp9/vp9_dx_iface.h @@ -46,6 +46,7 @@ struct vpx_codec_alg_priv { int svc_decoding; int svc_spatial_layer; int row_mt; + int lpf_opt; }; #endif // 
VPX_VP9_VP9_DX_IFACE_H_ |