diff options
author | Supradeep T R <supradeep.tr@ittiam.com> | 2018-06-12 13:57:39 +0530 |
---|---|---|
committer | Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com> | 2018-08-20 12:07:37 +0530 |
commit | dafe064289a917977439ab6f4f002b9946496084 (patch) | |
tree | 46caa91fc7a6b9811234ed9f71a0e8534196bd02 /vp9/common | |
parent | f1d44c1f45bb0828ef4a683ad42544b3c6fc3af1 (diff) | |
download | libvpx-dafe064289a917977439ab6f4f002b9946496084.tar libvpx-dafe064289a917977439ab6f4f002b9946496084.tar.gz libvpx-dafe064289a917977439ab6f4f002b9946496084.tar.bz2 libvpx-dafe064289a917977439ab6f4f002b9946496084.zip |
Loopfilter MultiThread Optimization
Run the loop filter (LPF) inside the tile worker hook. This means the LPF runs
immediately after decode, without waiting for all threads to sync.
Performance improvement:
Platform | Resolution | 2 Threads | 4 Threads
x86      | 720p       | 7.24%     | 22.04%
x86      | 1080p      | 5.29%     | 17.02%
ARM      | 720p       | 4.61%     | 8.75%
ARM      | 1080p      | 5.55%     | 12.03%
x86 improvement measured on an Intel Core i7-6700 CPU @ 2.10GHz, set to
performance mode with turbo mode off.
ARM improvement measured on a Nexus 6 (Snapdragon 805, quad-core @ 2.65 GHz).
Change-Id: Ifa73c71b40db3fa7fa16f54f4e3aa06d1258caae
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 2 | ||||
-rw-r--r-- | vp9/common/vp9_thread_common.c | 147 | ||||
-rw-r--r-- | vp9/common/vp9_thread_common.h | 18 |
3 files changed, 167 insertions, 0 deletions
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 1d96d92c2..e053e2ee0 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -256,6 +256,8 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index e0f5e0d83..dc9aa405a 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -229,6 +229,26 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. 
For example, for 4k @@ -266,6 +286,25 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, pthread_cond_init(&lf_sync->cond_[i], NULL); } } + pthread_mutex_init(&lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + int i; + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + int i; + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); + } + } } #endif // CONFIG_MULTITHREAD @@ -276,6 +315,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. lf_sync->sync_range = get_sync_range(width); } @@ -298,15 +342,118 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { } vpx_free(lf_sync->cond_); } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + pthread_mutex_destroy(&lf_sync->lf_mutex); + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); + } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. 
vp9_zero(*lf_sync); } } +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + int cur_row; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(&lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(&lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. 
+ * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync, + MACROBLOCKD *xd) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while (!xd->corrupted && (mi_row = get_next_row(cm, lf_sync)) != -1 && + mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; +#endif // CONFIG_MULTITHREAD +} + // Accumulate frame counts. void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h index 0f7c3ff74..09609f821 100644 --- a/vp9/common/vp9_thread_common.h +++ b/vp9/common/vp9_thread_common.h @@ -37,6 +37,13 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; } VP9LfSync; // Allocate memory for loopfilter row synchronization. 
@@ -53,6 +60,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync, + MACROBLOCKD *xd); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row); + +void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); |