summaryrefslogtreecommitdiff
path: root/vp9/common
diff options
context:
space:
mode:
author: Supradeep T R <supradeep.tr@ittiam.com> 2018-06-12 13:57:39 +0530
committer: Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com> 2018-08-20 12:07:37 +0530
commit: dafe064289a917977439ab6f4f002b9946496084 (patch)
tree: 46caa91fc7a6b9811234ed9f71a0e8534196bd02 /vp9/common
parent: f1d44c1f45bb0828ef4a683ad42544b3c6fc3af1 (diff)
downloadlibvpx-dafe064289a917977439ab6f4f002b9946496084.tar
libvpx-dafe064289a917977439ab6f4f002b9946496084.tar.gz
libvpx-dafe064289a917977439ab6f4f002b9946496084.tar.bz2
libvpx-dafe064289a917977439ab6f4f002b9946496084.zip
Loopfilter MultiThread Optimization
Adding LPF within the tileworker hook. This means that LPF will be done
immediately after decode, without waiting for all threads to sync.

Performance Improvement -

Platform  Resolution  2 Threads  4 Threads
X86       720p        7.24%      22.04%
          1080p       5.29%      17.02%
ARM       720p        4.61%      8.75%
          1080p       5.55%      12.03%

x86 Improvement measured on Intel Core i7-6700 CPU @ 2.10GHz set in performance with turbo mode off
ARM Improvement measured on Nexus 6 Snapdragon 805 Quad-core @ 2.65 GHz

Change-Id: Ifa73c71b40db3fa7fa16f54f4e3aa06d1258caae
Diffstat (limited to 'vp9/common')
-rw-r--r-- vp9/common/vp9_onyxc_int.h | 2
-rw-r--r-- vp9/common/vp9_thread_common.c | 147
-rw-r--r-- vp9/common/vp9_thread_common.h | 18
3 files changed, 167 insertions, 0 deletions
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 1d96d92c2..e053e2ee0 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -256,6 +256,8 @@ typedef struct VP9Common {
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
int above_context_alloc_cols;
+
+ int lf_row;
} VP9_COMMON;
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index e0f5e0d83..dc9aa405a 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -229,6 +229,26 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
workers, num_workers, lf_sync);
}
+// Prepares the loop-filter row synchronization state for one frame.
+// Does nothing when frame_filter_level is zero. Otherwise the sync data is
+// (re)allocated if needed, the per-SB-row progress arrays are reset and the
+// shared row cursor cm->lf_row is rewound to 0.
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level,
+                     int num_workers) {
+  // Row count in superblock units, derived from the SB-aligned mi_rows.
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+
+  if (frame_filter_level == 0) return;
+
+  // Reallocate when nothing is set up yet (sync_range is zero), when the row
+  // count differs from the allocated one, or when more workers are requested
+  // than the current allocation covers.
+  if (lf_sync->sync_range == 0 || lf_sync->rows != sb_rows ||
+      lf_sync->num_workers < num_workers) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+  }
+
+  // Row claiming starts again from the first row.
+  cm->lf_row = 0;
+
+  // No tile has completed any SB row yet.
+  memset(lf_sync->num_tiles_done, 0,
+         sb_rows * sizeof(*lf_sync->num_tiles_done));
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sb_rows * sizeof(*lf_sync->cur_sb_col));
+}
+
// Set up nsync by width.
static INLINE int get_sync_range(int width) {
// nsync numbers are picked by testing. For example, for 4k
@@ -266,6 +286,25 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
pthread_cond_init(&lf_sync->cond_[i], NULL);
}
}
+ pthread_mutex_init(&lf_sync->lf_mutex, NULL);
+
+ CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+ vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
+ if (lf_sync->recon_done_mutex) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+ vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
+ if (lf_sync->recon_done_cond) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
+ }
+ }
}
#endif // CONFIG_MULTITHREAD
@@ -276,6 +315,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+ CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+ vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
+ mi_cols_aligned_to_sb(cm->mi_rows) >>
+ MI_BLOCK_SIZE_LOG2));
+
// Set up nsync.
lf_sync->sync_range = get_sync_range(width);
}
@@ -298,15 +342,118 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
}
vpx_free(lf_sync->cond_);
}
+ if (lf_sync->recon_done_mutex != NULL) {
+ int i;
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]);
+ }
+ vpx_free(lf_sync->recon_done_mutex);
+ }
+
+ pthread_mutex_destroy(&lf_sync->lf_mutex);
+ if (lf_sync->recon_done_cond != NULL) {
+ int i;
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->recon_done_cond[i]);
+ }
+ vpx_free(lf_sync->recon_done_cond);
+ }
#endif // CONFIG_MULTITHREAD
+
vpx_free(lf_sync->lfdata);
vpx_free(lf_sync->cur_sb_col);
+ vpx_free(lf_sync->num_tiles_done);
// clear the structure as the source of this call may be a resize in which
// case this call will be followed by an _alloc() which may fail.
vp9_zero(*lf_sync);
}
}
+// Claims the next superblock row to loop-filter and, in multithreaded builds,
+// blocks until the reconstruction that row depends on has been reported done.
+//
+// Returns the claimed row in mi units (the previous value of cm->lf_row), or
+// -1 once cm->lf_row has reached cm->mi_rows. Each successful claim advances
+// cm->lf_row by MI_BLOCK_SIZE.
+static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
+  int return_val = -1;
+  const int max_rows = cm->mi_rows;
+
+#if CONFIG_MULTITHREAD
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  // SB row whose reconstruction must be complete before filtering may start.
+  int wait_row = 0;
+
+  // lf_mutex serializes the claim so each row is handed out exactly once.
+  pthread_mutex_lock(&lf_sync->lf_mutex);
+  if (cm->lf_row < max_rows) {
+    return_val = cm->lf_row;
+    wait_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
+    cm->lf_row += MI_BLOCK_SIZE;
+    if (cm->lf_row < max_rows) {
+      /* If this is not the last row, make sure the next row is also decoded.
+       * This is because the intra predict has to happen before loop filter */
+      wait_row += 1;
+    }
+  }
+  pthread_mutex_unlock(&lf_sync->lf_mutex);
+
+  if (return_val == -1) return return_val;
+
+  pthread_mutex_lock(&lf_sync->recon_done_mutex[wait_row]);
+  // Re-check the predicate after every wakeup: pthread_cond_wait() may return
+  // spuriously, and filtering must not begin before all tile columns of
+  // wait_row have been counted in num_tiles_done.
+  // NOTE(review): relies on vp9_set_row() callers passing
+  // num_tiles == (1 << cm->log2_tile_cols) - confirm against the decoder.
+  while (lf_sync->num_tiles_done[wait_row] < tile_cols) {
+    pthread_cond_wait(&lf_sync->recon_done_cond[wait_row],
+                      &lf_sync->recon_done_mutex[wait_row]);
+  }
+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[wait_row]);
+#else
+  (void)lf_sync;
+  // Without threads there is nothing to wait for; just hand out the row.
+  if (cm->lf_row < max_rows) {
+    return_val = cm->lf_row;
+    cm->lf_row += MI_BLOCK_SIZE;
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  return return_val;
+}
+
+// Claims rows through get_next_row() and loop-filters them one MI_BLOCK_SIZE
+// span at a time. Stops when xd->corrupted is set, when get_next_row()
+// returns -1, or when the returned row is not below cm->mi_rows.
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync,
+                         MACROBLOCKD *xd) {
+  VP9_COMMON *cm = lf_data->cm;
+
+  for (;;) {
+    int mi_row;
+
+    // Check the corruption flag first so that no row is claimed once it is
+    // set.
+    if (xd->corrupted) break;
+
+    mi_row = get_next_row(cm, lf_sync);
+    if (mi_row == -1) break;
+    if (mi_row >= cm->mi_rows) break;
+
+    // Record the span being filtered in the worker data, then filter it.
+    lf_data->start = mi_row;
+    lf_data->stop = mi_row + MI_BLOCK_SIZE;
+
+    thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                            lf_data->start, lf_data->stop, lf_data->y_only,
+                            lf_sync);
+  }
+}
+
+// Records that one more tile has finished superblock row `row`. Once the
+// count for that row equals num_tiles, wakes the waiter(s) on the row's
+// condition variable. A no-op in builds without CONFIG_MULTITHREAD.
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *const row_mutex = &lf_sync->recon_done_mutex[row];
+  pthread_cond_t *const row_cond = &lf_sync->recon_done_cond[row];
+
+  pthread_mutex_lock(row_mutex);
+  if (++lf_sync->num_tiles_done[row] == num_tiles) {
+    if (!is_last_row) {
+      pthread_cond_signal(row_cond);
+    } else {
+      /* The last 2 rows wait on the last row to be done.
+       * So, we have to broadcast the signal in this case.
+       */
+      pthread_cond_broadcast(row_cond);
+    }
+  }
+  pthread_mutex_unlock(row_mutex);
+#else
+  (void)is_last_row;
+  (void)row;
+  (void)num_tiles;
+  (void)lf_sync;
+#endif  // CONFIG_MULTITHREAD
+}
+
// Accumulate frame counts.
void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
const FRAME_COUNTS *counts, int is_dec) {
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index 0f7c3ff74..09609f821 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -37,6 +37,13 @@ typedef struct VP9LfSyncData {
// Row-based parallel loopfilter data
LFWorkerData *lfdata;
int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t lf_mutex;
+ pthread_mutex_t *recon_done_mutex;
+ pthread_cond_t *recon_done_cond;
+#endif
+ int *num_tiles_done;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
@@ -53,6 +60,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm,
int partial_frame, VPxWorker *workers,
int num_workers, VP9LfSync *lf_sync);
+// Multi-threaded loopfilter initialisations
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm,
+ int frame_filter_level, int num_workers);
+
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync,
+ MACROBLOCKD *xd);
+
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row);
+
+void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row);
+
void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
const struct FRAME_COUNTS *counts, int is_dec);