summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com> 2016-12-27 18:45:43 +0530
committer Yunqing Wang <yunqingwang@google.com> 2017-01-24 15:48:02 -0800
commit 8b0c11c3588963fa02be0cad36a6a23cdb748cf9 (patch)
tree a9dbfb4cc4adbfaea38f73ec4e73099642e58c25
parent 91aa1fae2a88e0a125161dc1ec3f8f73c3508707 (diff)
downloadlibvpx-8b0c11c3588963fa02be0cad36a6a23cdb748cf9.tar
libvpx-8b0c11c3588963fa02be0cad36a6a23cdb748cf9.tar.gz
libvpx-8b0c11c3588963fa02be0cad36a6a23cdb748cf9.tar.bz2
libvpx-8b0c11c3588963fa02be0cad36a6a23cdb748cf9.zip
Multi-threading of first pass stats collection
(yunqingwang) 1. Rebased the patch. Incorporated recent first pass changes. 2. Turned on the first pass unit test. Change-Id: Ia2f7ba8152d0b6dd6bf8efb9dfaf505ba7d8edee
-rw-r--r--test/vp9_ethread_test.cc9
-rw-r--r--vp9/encoder/vp9_block.h4
-rw-r--r--vp9/encoder/vp9_encodemb.c7
-rw-r--r--vp9/encoder/vp9_encoder.c21
-rw-r--r--vp9/encoder/vp9_encoder.h46
-rw-r--r--vp9/encoder/vp9_ethread.c324
-rw-r--r--vp9/encoder/vp9_ethread.h35
-rw-r--r--vp9/encoder/vp9_firstpass.c1120
-rw-r--r--vp9/encoder/vp9_firstpass.h47
-rw-r--r--vp9/encoder/vp9_job_queue.h46
-rw-r--r--vp9/encoder/vp9_multi_thread.c282
-rw-r--r--vp9/encoder/vp9_multi_thread.h38
-rw-r--r--vp9/vp9_cx_iface.c13
-rw-r--r--vp9/vp9cx.mk3
-rw-r--r--vpx/vp8cx.h11
-rw-r--r--vpxenc.c5
16 files changed, 1496 insertions, 515 deletions
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index ee522565c..d7799301d 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -82,9 +82,8 @@ class VPxFirstPassEncoderThreadTest
encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
// For now, new_mt_mode only works for 2-pass encoding.
- // Enable this once the fp mt patch is checked in.
- // if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
- // encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
+ if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
+ encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
encoder_initialized_ = true;
}
@@ -131,7 +130,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) {
for (j = 0; j < kDbl; ++j) {
EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
- fabs(*frame_stats1) / 1000.0);
+ fabs(*frame_stats1) / 10000.0);
frame_stats1++;
frame_stats2++;
}
@@ -146,7 +145,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) {
}
TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
- ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 50);
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
first_pass_only_ = 1;
cfg_.rc_target_bitrate = 1000;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 0d5075ca9..91d07e3a0 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -131,6 +131,10 @@ struct macroblock {
int use_lp32x32fdct;
int skip_encode;
+ // In first pass, intra prediction is done based on source pixels
+ // at tile boundaries
+ int fp_src_pred;
+
// use fast quantization process
int quant_fp;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 2cb137d8b..1dc8d34d5 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -773,9 +773,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
}
}
- vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
- x->skip_encode ? src_stride : dst_stride, dst,
- dst_stride, col, row, plane);
+ vp9_predict_intra_block(
+ xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst,
+ (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
+ dst_stride, col, row, plane);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 26326fcf0..5dbe62db5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -50,6 +50,7 @@
#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_multi_thread.h"
#include "vp9/encoder/vp9_noise_estimate.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -1563,6 +1564,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif
+
+ // Enable multi-threading for first pass.
+ cpi->new_mt = 0;
+ if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
+ cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
+ cpi->oxcf.new_mt)
+ cpi->new_mt = 1;
}
#ifndef M_LOG2_E
@@ -1719,6 +1727,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
}
#endif
+#if ENABLE_MT_BIT_MATCH
+ CHECK_MEM_ERROR(
+ cm, cpi->twopass.fp_mb_float_stats,
+ vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
+#endif
+
cpi->refresh_alt_ref_frame = 0;
cpi->multi_arf_last_grp_enabled = 0;
@@ -2076,6 +2090,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
vpx_free(cpi->tile_thr_data);
vpx_free(cpi->workers);
+ vp9_row_mt_mem_dealloc(cpi);
if (cpi->num_workers > 1) {
vp9_loop_filter_dealloc(&cpi->lf_row_sync);
@@ -2098,6 +2113,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#endif
+#if ENABLE_MT_BIT_MATCH
+ vpx_free(cpi->twopass.fp_mb_float_stats);
+ cpi->twopass.fp_mb_float_stats = NULL;
+#endif
+
vp9_remove_common(cm);
vp9_free_ref_frame_buffers(cm->buffer_pool);
#if CONFIG_VP9_POSTPROC
@@ -4802,6 +4822,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
}
+ cpi->td.mb.fp_src_pred = 0;
if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 00552e1cc..e1046f14a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -33,7 +33,9 @@
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_job_queue.h"
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/encoder/vp9_mbgraph.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -256,6 +258,8 @@ typedef struct VP9EncoderConfig {
int render_width;
int render_height;
VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+
+ int new_mt;
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -269,8 +273,34 @@ typedef struct TileDataEnc {
int mode_map[BLOCK_SIZES][MAX_MODES];
int m_search_count;
int ex_search_count;
+ FIRSTPASS_DATA fp_data;
+ VP9RowMTSync row_mt_sync;
} TileDataEnc;
+typedef struct RowMTInfo {
+ JobQueueHandle job_queue_hdl;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t job_mutex;
+#endif
+} RowMTInfo;
+
+typedef struct MultiThreadHandle {
+ int allocated_tile_rows;
+ int allocated_tile_cols;
+ int allocated_vert_unit_rows;
+
+ // Frame level params
+ int num_tile_vert_sbs[MAX_NUM_TILE_ROWS];
+
+ // Job Queue structure and handles
+ JobQueue *job_queue;
+
+ int jobs_per_tile_col;
+
+ RowMTInfo row_mt_info[MAX_NUM_TILE_COLS];
+ int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles
+} MultiThreadHandle;
+
typedef struct RD_COUNTS {
vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int64_t comp_pred_diff[REFERENCE_MODES];
@@ -629,6 +659,10 @@ typedef struct VP9_COMP {
int keep_level_stats;
Vp9LevelInfo level_info;
+ MultiThreadHandle multi_thread_ctxt;
+ void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
+ void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
+ int new_mt;
// Previous Partition Info
BLOCK_SIZE *prev_partition;
@@ -808,6 +842,18 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
}
+static INLINE int get_num_vert_units(TileInfo tile, int shift) {
+ int num_vert_units =
+ (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift;
+ return num_vert_units;
+}
+
+static INLINE int get_num_cols(TileInfo tile, int shift) {
+ int num_cols =
+ (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift;
+ return num_cols;
+}
+
static INLINE int get_level_index(VP9_LEVEL level) {
int i;
for (i = 0; i < VP9_LEVELS; ++i) {
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index f4f7c7bac..6cc103c89 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -11,6 +11,8 @@
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_multi_thread.h"
#include "vpx_dsp/vpx_dsp_common.h"
static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
@@ -64,15 +66,11 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
return (1 << log2_tile_cols);
}
-void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
VP9_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
- const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
int i;
- vp9_init_tile_data(cpi);
-
// Only run once to create threads and allocate thread data.
if (cpi->num_workers == 0) {
int allocated_workers = num_workers;
@@ -123,19 +121,57 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
thread_data->cpi = cpi;
thread_data->td = &cpi->td;
}
-
winterface->sync(worker);
}
}
+}
+
+static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
+ int num_workers) {
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ int i;
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
- EncWorkerData *thread_data;
-
- worker->hook = (VPxWorkerHook)enc_worker_hook;
+ worker->hook = (VPxWorkerHook)hook;
worker->data1 = &cpi->tile_thr_data[i];
- worker->data2 = NULL;
- thread_data = (EncWorkerData *)worker->data1;
+ worker->data2 = data2;
+ }
+
+ // Encode a frame
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+
+ // Encoding ends.
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+}
+
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
+ int i;
+
+ vp9_init_tile_data(cpi);
+
+ create_enc_workers(cpi, num_workers);
+
+ for (i = 0; i < num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
// Before encoding a frame, copy the thread data from cpi.
if (thread_data->td != &cpi->td) {
@@ -165,34 +201,266 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
}
}
- // Encode a frame
+ launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers);
+
for (i = 0; i < num_workers; i++) {
VPxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
- // Set the starting tile for each thread.
- thread_data->start = i;
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
- if (i == cpi->num_workers - 1)
- winterface->execute(worker);
- else
- winterface->launch(worker);
+static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
+ TileDataEnc *tile_data_t) {
+ tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor;
+ tile_data->fp_data.brightness_factor +=
+ tile_data_t->fp_data.brightness_factor;
+ tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error;
+ tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error;
+ tile_data->fp_data.frame_noise_energy +=
+ tile_data_t->fp_data.frame_noise_energy;
+ tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error;
+ tile_data->fp_data.intercount += tile_data_t->fp_data.intercount;
+ tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count;
+ tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count;
+ tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
+ tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+ tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
+ tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
+ tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
+ tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs;
+ tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs;
+ tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs;
+ tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors;
+ tile_data->fp_data.intra_smooth_count +=
+ tile_data_t->fp_data.intra_smooth_count;
+ tile_data->fp_data.image_data_start_row =
+ VPXMIN(tile_data->fp_data.image_data_start_row,
+ tile_data_t->fp_data.image_data_start_row) == INVALID_ROW
+ ? VPXMAX(tile_data->fp_data.image_data_start_row,
+ tile_data_t->fp_data.image_data_start_row)
+ : VPXMIN(tile_data->fp_data.image_data_start_row,
+ tile_data_t->fp_data.image_data_start_row);
+}
+
+// Allocate memory for row synchronization
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
+ int rows) {
+ row_mt_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+ vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+ if (row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+ vpx_malloc(sizeof(*row_mt_sync->cond_) * rows));
+ if (row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&row_mt_sync->cond_[i], NULL);
+ }
+ }
}
+#endif // CONFIG_MULTITHREAD
- // Encoding ends.
- for (i = 0; i < num_workers; i++) {
- VPxWorker *const worker = &cpi->workers[i];
- winterface->sync(worker);
+ CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+ vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+
+ // Set up nsync: the column interval at which rows synchronize.
+ row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) {
+ if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (row_mt_sync->mutex_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+ }
+ vpx_free(row_mt_sync->mutex_);
+ }
+ if (row_mt_sync->cond_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_cond_destroy(&row_mt_sync->cond_[i]);
+ }
+ vpx_free(row_mt_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ vpx_free(row_mt_sync->cur_col);
+ // clear the structure as the source of this call may be dynamic change
+ // in tiles in which case this call will be followed by an _alloc()
+ // which may fail.
+ vp9_zero(*row_mt_sync);
}
+}
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > row_mt_sync->cur_col[r - 1] - nsync) {
+ pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ return;
+}
+
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+ int cur;
+ // Only signal when enough blocks are encoded for the next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&row_mt_sync->mutex_[r]);
+
+ row_mt_sync->cur_col[r] = cur;
+
+ pthread_cond_signal(&row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+ return;
+}
+
+static int first_pass_worker_hook(EncWorkerData *const thread_data,
+ MultiThreadHandle *multi_thread_ctxt) {
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ int tile_row, tile_col;
+ TileDataEnc *this_tile;
+ int end_of_frame;
+ int thread_id = thread_data->thread_id;
+ int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+ JobNode *proc_job = NULL;
+ FIRSTPASS_DATA fp_acc_data;
+ MV zero_mv = { 0, 0 };
+ MV best_ref_mv;
+ int mb_row;
+
+ end_of_frame = 0;
+ while (0 == end_of_frame) {
+ // Get the next job in the queue
+ proc_job =
+ (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+ if (NULL == proc_job) {
+ // Query for the status of other tiles
+ end_of_frame = vp9_get_tiles_proc_status(
+ multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+ tile_cols);
+ } else {
+ tile_col = proc_job->tile_col_id;
+ tile_row = proc_job->tile_row_id;
+
+ this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ mb_row = proc_job->vert_unit_row_num;
+
+ best_ref_mv = zero_mv;
+ vp9_zero(fp_acc_data);
+ fp_acc_data.image_data_start_row = INVALID_ROW;
+ vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data,
+ this_tile, &best_ref_mv, mb_row);
+ }
+ }
+ return 0;
+}
+
+void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ TileDataEnc *first_tile_col;
+ int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+ int i;
+
+ if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+ multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+ multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+ vp9_row_mt_mem_dealloc(cpi);
+ vp9_init_tile_data(cpi);
+ vp9_row_mt_mem_alloc(cpi);
+ } else {
+ vp9_init_tile_data(cpi);
+ }
+
+ create_enc_workers(cpi, num_workers);
+
+ vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+ vp9_prepare_job_queue(cpi, FIRST_PASS_JOB);
+
+ vp9_multi_thread_tile_init(cpi);
for (i = 0; i < num_workers; i++) {
- VPxWorker *const worker = &cpi->workers[i];
- EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
- // Accumulate counters.
- if (i < cpi->num_workers - 1) {
- vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
- accumulate_rd_opt(&cpi->td, thread_data->td);
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
}
}
+
+ launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook,
+ multi_thread_ctxt, num_workers);
+
+ first_tile_col = &cpi->tile_data[0];
+ for (i = 1; i < tile_cols; i++) {
+ TileDataEnc *this_tile = &cpi->tile_data[i];
+ accumulate_fp_tile_stat(first_tile_col, this_tile);
+ }
}
diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h
index 1efa4dcde..968e500fb 100644
--- a/vp9/encoder/vp9_ethread.h
+++ b/vp9/encoder/vp9_ethread.h
@@ -15,6 +15,10 @@
extern "C" {
#endif
+#define MAX_NUM_TILE_COLS (1 << 6)
+#define MAX_NUM_TILE_ROWS 4
+#define MAX_NUM_THREADS 80
+
struct VP9_COMP;
struct ThreadData;
@@ -22,10 +26,41 @@ typedef struct EncWorkerData {
struct VP9_COMP *cpi;
struct ThreadData *td;
int start;
+ int thread_id;
+ int tile_completion_status[MAX_NUM_TILE_COLS];
} EncWorkerData;
+// Encoder row synchronization
+typedef struct VP9RowMTSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ // One entry per row: the sb/mb column index last signalled for that row.
+ int *cur_col;
+ int sync_range;
+ int rows;
+} VP9RowMTSync;
+
void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols);
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols);
+
+// Allocate memory for row based multi-threading synchronization.
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm,
+ int rows);
+
+// Deallocate row based multi-threading synchronization related mutex and data.
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 72e9ac77e..59dd53697 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -31,6 +31,7 @@
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_extend.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -646,37 +647,150 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) {
return block_noise << 2; // Scale << 2 to account for sampling.
}
-#define INVALID_ROW -1
-void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+#if ENABLE_MT_BIT_MATCH
+static void accumulate_floating_point_stats(VP9_COMP *cpi,
+ TileDataEnc *first_tile_col) {
+ VP9_COMMON *const cm = &cpi->common;
int mb_row, mb_col;
- MACROBLOCK *const x = &cpi->td.mb;
+ first_tile_col->fp_data.intra_factor = 0;
+ first_tile_col->fp_data.brightness_factor = 0;
+ first_tile_col->fp_data.neutral_count = 0;
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+ first_tile_col->fp_data.intra_factor +=
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor;
+ first_tile_col->fp_data.brightness_factor +=
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor;
+ first_tile_col->fp_data.neutral_count +=
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count;
+ }
+ }
+}
+#endif
+
+static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
+ FIRSTPASS_DATA *fp_acc_data) {
+ VP9_COMMON *const cm = &cpi->common;
+ // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ // Clamp the image start to rows/2. This number of rows is discarded top
+ // and bottom as dead data so rows / 2 means the frame is blank.
+ if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) ||
+ (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+ fp_acc_data->image_data_start_row = cm->mb_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (fp_acc_data->image_data_start_row > 0) {
+ fp_acc_data->intra_skip_count =
+ VPXMAX(0, fp_acc_data->intra_skip_count -
+ (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+ }
+
+ fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
+ fp_acc_data->brightness_factor =
+ fp_acc_data->brightness_factor / (double)num_mbs;
+ fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor;
+
+ fps->frame = cm->current_video_frame;
+ fps->spatial_layer_id = cpi->svc.spatial_layer_id;
+ fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err;
+ fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err;
+ fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err;
+ fps->frame_noise_energy =
+ (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
+ fps->count = 1.0;
+ fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs;
+ fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs;
+ fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs;
+ fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs;
+ fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs;
+ fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row);
+ // Currently set to 0 as most issues relate to letter boxing.
+ fps->inactive_zone_cols = (double)0;
+
+ if (fp_acc_data->mvcount > 0) {
+ fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
+ fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
+ fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
+ fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount;
+ fps->MVrv = ((double)(fp_acc_data->sum_mvrs) -
+ ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) /
+ fp_acc_data->mvcount)) /
+ fp_acc_data->mvcount;
+ fps->MVcv = ((double)(fp_acc_data->sum_mvcs) -
+ ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) /
+ fp_acc_data->mvcount)) /
+ fp_acc_data->mvcount;
+ fps->mv_in_out_count =
+ (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
+ fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
+ } else {
+ fps->MVr = 0.0;
+ fps->mvr_abs = 0.0;
+ fps->MVc = 0.0;
+ fps->mvc_abs = 0.0;
+ fps->MVrv = 0.0;
+ fps->MVcv = 0.0;
+ fps->mv_in_out_count = 0.0;
+ fps->pcnt_motion = 0.0;
+ }
+}
+
+static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
+ FIRSTPASS_DATA *fp_acc_data) {
+ this_tile->fp_data.intra_factor += fp_acc_data->intra_factor;
+ this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor;
+ this_tile->fp_data.coded_error += fp_acc_data->coded_error;
+ this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error;
+ this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy;
+ this_tile->fp_data.intra_error += fp_acc_data->intra_error;
+ this_tile->fp_data.intercount += fp_acc_data->intercount;
+ this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count;
+ this_tile->fp_data.neutral_count += fp_acc_data->neutral_count;
+ this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+ this_tile->fp_data.mvcount += fp_acc_data->mvcount;
+ this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
+ this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
+ this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc;
+ this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs;
+ this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs;
+ this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs;
+ this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors;
+ this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count;
+ this_tile->fp_data.image_data_start_row =
+ VPXMIN(this_tile->fp_data.image_data_start_row,
+ fp_acc_data->image_data_start_row) == INVALID_ROW
+ ? VPXMAX(this_tile->fp_data.image_data_start_row,
+ fp_acc_data->image_data_start_row)
+ : VPXMIN(this_tile->fp_data.image_data_start_row,
+ fp_acc_data->image_data_start_row);
+}
+
+void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
+ FIRSTPASS_DATA *fp_acc_data,
+ TileDataEnc *tile_data, MV *best_ref_mv,
+ int mb_row) {
+ int mb_col;
+ MACROBLOCK *const x = &td->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- TileInfo tile;
+ TileInfo tile = tile_data->tile_info;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
- int i;
+ const PICK_MODE_CONTEXT *ctx = &td->pc_root->none;
+ int i, c;
+ int num_mb_cols = get_num_cols(tile_data->tile_info, 1);
int recon_yoffset, recon_uvoffset;
- int64_t intra_error = 0;
- int64_t coded_error = 0;
- int64_t sr_coded_error = 0;
- int64_t frame_noise_energy = 0;
-
- int sum_mvr = 0, sum_mvc = 0;
- int sum_mvr_abs = 0, sum_mvc_abs = 0;
- int64_t sum_mvrs = 0, sum_mvcs = 0;
- int mvcount = 0;
- int intercount = 0;
- int second_ref_count = 0;
const int intrapenalty = INTRA_MODE_PENALTY;
- double neutral_count;
- int intra_skip_count = 0;
- int intra_smooth_count = 0;
- int image_data_start_row = INVALID_ROW;
- int sum_in_vectors = 0;
- TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = { 0, 0 };
int recon_y_stride, recon_uv_stride, uv_mb_height;
@@ -688,50 +802,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
LAYER_CONTEXT *const lc =
is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
: NULL;
- double intra_factor;
- double brightness_factor;
- BufferPool *const pool = cm->buffer_pool;
MODE_INFO mi_above, mi_left;
// First pass code requires valid last and new frame buffers.
assert(new_yv12 != NULL);
assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
- }
-#endif
-
- vpx_clear_system_state();
-
- intra_factor = 0.0;
- brightness_factor = 0.0;
- neutral_count = 0.0;
-
- set_first_pass_params(cpi);
- vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
-
if (lc != NULL) {
- twopass = &lc->twopass;
-
- cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
- cpi->ref_frame_flags = VP9_LAST_FLAG;
-
- if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
- REF_FRAMES) {
- cpi->gld_fb_idx =
- cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
- cpi->ref_frame_flags |= VP9_GOLD_FLAG;
- cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
- } else {
- cpi->refresh_golden_frame = 0;
- }
-
- if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
-
- vp9_scale_references(cpi);
-
// Use either last frame or alt frame for motion search.
if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
@@ -747,28 +824,11 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
} else {
gld_yv12 = NULL;
}
-
- set_ref_ptrs(cm, xd,
- (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
- (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
-
- cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
- &cpi->scaled_source, 0);
}
- vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
- vp9_setup_src_planes(x, cpi->Source, 0, 0);
- vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
-
- if (!frame_is_intra_only(cm)) {
- vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
- }
-
- xd->mi = cm->mi_grid_visible;
- xd->mi[0] = cm->mi;
-
- vp9_frame_init_quantizer(cpi);
+ xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) +
+ (tile.mi_col_start >> 1);
+ xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1);
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][1];
@@ -776,276 +836,363 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
p[i].eobs = ctx->eobs_pbuf[i][1];
}
- x->skip_recode = 0;
-
- vp9_init_mv_probs(cm);
- vp9_initialize_rd_consts(cpi);
-
- // Tiling is ignored in the first pass.
- vp9_tile_init(&tile, cm, 0, 0);
recon_y_stride = new_yv12->y_stride;
recon_uv_stride = new_yv12->uv_stride;
uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
- for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
- MV best_ref_mv = { 0, 0 };
+ // Reset above block coeffs.
+ recon_yoffset =
+ (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16;
+ recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) +
+ (tile.mi_col_start >> 1) * uv_mb_height;
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.row_max =
+ ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+ for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1);
+ ++mb_col, c++) {
+ int this_error;
+ int this_intra_error;
+ const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+#if ENABLE_MT_BIT_MATCH
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
- // Reset above block coeffs.
- recon_yoffset = (mb_row * recon_y_stride * 16);
- recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
- // Set up limit values for motion vectors to prevent them extending
- // outside the UMV borders.
- x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
- x->mv_limits.row_max =
- ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+ (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c - 1);
- for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
- int this_error;
- int this_intra_error;
- const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
- const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
- double log_intra;
- int level_sample;
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf = cpi->Source->y_buffer +
+ mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
+ x->plane[1].src.buf = cpi->Source->u_buffer +
+ mb_row * uv_mb_height * x->plane[1].src.stride +
+ mb_col * uv_mb_height;
+ x->plane[2].src.buf = cpi->Source->v_buffer +
+ mb_row * uv_mb_height * x->plane[1].src.stride +
+ mb_col * uv_mb_height;
-#if CONFIG_FP_MB_STATS
- const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
+ vpx_clear_system_state();
- vpx_clear_system_state();
-
- xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
- xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
- xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
- xd->mi[0]->sb_type = bsize;
- xd->mi[0]->ref_frame[0] = INTRA_FRAME;
- set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
- mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
- cm->mi_rows, cm->mi_cols);
- // Are edges available for intra prediction?
- // Since the firstpass does not populate the mi_grid_visible,
- // above_mi/left_mi must be overwritten with a nonzero value when edges
- // are available. Required by vp9_predict_intra_block().
- xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
- xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL;
-
- // Do intra 16x16 prediction.
- x->skip_encode = 0;
- xd->mi[0]->mode = DC_PRED;
- xd->mi[0]->tx_size =
- use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-
- // Set the 16x16 src_diff block to zero, which ensures correct this_error
- // calculation for block sizes smaller than 16x16.
- vp9_zero_array(x->plane[0].src_diff, 256);
- vp9_encode_intra_block_plane(x, bsize, 0, 0);
- this_error = vpx_get_mb_ss(x->plane[0].src_diff);
- this_intra_error = this_error;
-
- // Keep a record of blocks that have very low intra error residual
- // (i.e. are in effect completely flat and untextured in the intra
- // domain). In natural videos this is uncommon, but it is much more
- // common in animations, graphics and screen content, so may be used
- // as a signal to detect these types of content.
- if (this_error < get_ul_intra_threshold(cm)) {
- ++intra_skip_count;
- } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
- image_data_start_row = mb_row;
- }
+ xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+ xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+ xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+ xd->mi[0]->sb_type = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+ mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows,
+ cm->mi_cols);
+ // Are edges available for intra prediction?
+ // Since the firstpass does not populate the mi_grid_visible,
+ // above_mi/left_mi must be overwritten with a nonzero value when edges
+ // are available. Required by vp9_predict_intra_block().
+ xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
+ xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL;
+
+ // Do intra 16x16 prediction.
+ x->skip_encode = 0;
+ x->fp_src_pred = 0;
+ // Do intra prediction based on source pixels for tile boundaries
+ if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) {
+ xd->left_mi = &mi_left;
+ x->fp_src_pred = 1;
+ }
+ xd->mi[0]->mode = DC_PRED;
+ xd->mi[0]->tx_size =
+ use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+ // Fix - zero the 16x16 block first. This ensures correct this_error for
+ // block sizes smaller than 16x16.
+ vp9_zero_array(x->plane[0].src_diff, 256);
+ vp9_encode_intra_block_plane(x, bsize, 0, 0);
+ this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+ this_intra_error = this_error;
+
+ // Keep a record of blocks that have very low intra error residual
+ // (i.e. are in effect completely flat and untextured in the intra
+ // domain). In natural videos this is uncommon, but it is much more
+ // common in animations, graphics and screen content, so may be used
+ // as a signal to detect these types of content.
+ if (this_error < get_ul_intra_threshold(cm)) {
+ ++(fp_acc_data->intra_skip_count);
+ } else if ((mb_col > 0) &&
+ (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+ fp_acc_data->image_data_start_row = mb_row;
+ }
- // Blocks that are mainly smooth in the intra domain.
- // Some special accounting for CQ but also these are better for testing
- // noise levels.
- if (this_error < get_smooth_intra_threshold(cm)) {
- ++intra_smooth_count;
- }
+ // Blocks that are mainly smooth in the intra domain.
+ // Some special accounting for CQ but also these are better for testing
+ // noise levels.
+ if (this_error < get_smooth_intra_threshold(cm)) {
+ ++(fp_acc_data->intra_smooth_count);
+ }
- // Special case noise measurement for first frame.
- if (cm->current_video_frame == 0) {
- if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
- frame_noise_energy += fp_estimate_block_noise(x, bsize);
- } else {
- frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
- }
+ // Special case noise measurement for first frame.
+ if (cm->current_video_frame == 0) {
+ if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+ fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+ } else {
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
}
+ }
#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- switch (cm->bit_depth) {
- case VPX_BITS_8: break;
- case VPX_BITS_10: this_error >>= 4; break;
- case VPX_BITS_12: this_error >>= 8; break;
- default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, "
- "VPX_BITS_10 or VPX_BITS_12");
- return;
- }
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8: break;
+ case VPX_BITS_10: this_error >>= 4; break;
+ case VPX_BITS_12: this_error >>= 8; break;
+ default:
+ assert(0 &&
+ "cm->bit_depth should be VPX_BITS_8, "
+ "VPX_BITS_10 or VPX_BITS_12");
+ return;
}
+ }
#endif // CONFIG_VP9_HIGHBITDEPTH
- vpx_clear_system_state();
- log_intra = log(this_error + 1.0);
- if (log_intra < 10.0)
- intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
- else
- intra_factor += 1.0;
+ vpx_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0) {
+ fp_acc_data->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+#if ENABLE_MT_BIT_MATCH
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor =
+ 1.0 + ((10.0 - log_intra) * 0.05);
+#endif
+ } else {
+ fp_acc_data->intra_factor += 1.0;
+#if ENABLE_MT_BIT_MATCH
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0;
+#endif
+ }
#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth)
- level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
- else
- level_sample = x->plane[0].src.buf[0];
-#else
+ if (cm->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
level_sample = x->plane[0].src.buf[0];
+#else
+ level_sample = x->plane[0].src.buf[0];
#endif
- if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
- brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
- else
- brightness_factor += 1.0;
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+ fp_acc_data->brightness_factor +=
+ 1.0 + (0.01 * (DARK_THRESH - level_sample));
+#if ENABLE_MT_BIT_MATCH
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+ 1.0 + (0.01 * (DARK_THRESH - level_sample));
+#endif
+ } else {
+ fp_acc_data->brightness_factor += 1.0;
+#if ENABLE_MT_BIT_MATCH
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = 1.0;
+#endif
+ }
- // Intrapenalty below deals with situations where the intra and inter
- // error scores are very low (e.g. a plain black frame).
- // We do not have special cases in first pass for 0,0 and nearest etc so
- // all inter modes carry an overhead cost estimate for the mv.
- // When the error score is very low this causes us to pick all or lots of
- // INTRA modes and throw lots of key frames.
- // This penalty adds a cost matching that of a 0,0 mv to the intra case.
- this_error += intrapenalty;
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
- // Accumulate the intra error.
- intra_error += (int64_t)this_error;
+ // Accumulate the intra error.
+ fp_acc_data->intra_error += (int64_t)this_error;
#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- // initialization
- cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
- }
+ if (cpi->use_fp_mb_stats) {
+ // initialization
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ }
#endif
- // Set up limit values for motion vectors to prevent them extending
- // outside the UMV borders.
- x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
- x->mv_limits.col_max =
- ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
-
- // Other than for the first frame do a motion search.
- if ((lc == NULL && cm->current_video_frame > 0) ||
- (lc != NULL && lc->current_video_frame_in_layer > 0)) {
- int tmp_err, motion_error, raw_motion_error;
- // Assume 0,0 motion with no mv overhead.
- MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
- struct buf_2d unscaled_last_source_buf_2d;
-
- xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.col_max =
+ ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+ // Other than for the first frame do a motion search.
+ if ((lc == NULL && cm->current_video_frame > 0) ||
+ (lc != NULL && lc->current_video_frame_in_layer > 0)) {
+ int tmp_err, motion_error, raw_motion_error;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+ struct buf_2d unscaled_last_source_buf_2d;
+
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- motion_error = highbd_get_prediction_error(
- bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
- } else {
- motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &xd->plane[0].pre[0]);
- }
-#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
motion_error =
get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+ }
+#else
+ motion_error =
+ get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- // Compute the motion error of the 0,0 motion using the last source
- // frame as the reference. Skip the further motion search on
- // reconstructed frame if this error is small.
- unscaled_last_source_buf_2d.buf =
- cpi->unscaled_last_source->y_buffer + recon_yoffset;
- unscaled_last_source_buf_2d.stride =
- cpi->unscaled_last_source->y_stride;
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + recon_yoffset;
+ unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- raw_motion_error = highbd_get_prediction_error(
- bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
- } else {
- raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &unscaled_last_source_buf_2d);
- }
-#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ raw_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+ } else {
raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&unscaled_last_source_buf_2d);
+ }
+#else
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
#endif // CONFIG_VP9_HIGHBITDEPTH
- // TODO(pengchong): Replace the hard-coded threshold
- if (raw_motion_error > 25 || lc != NULL) {
- // Test last reference frame using the previous best mv as the
- // starting point (best reference) for the search.
- first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
-
- // If the current best reference mv is not centered on 0,0 then do a
- // 0,0 based search as well.
- if (!is_zero_mv(&best_ref_mv)) {
- tmp_err = INT_MAX;
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
-
- if (tmp_err < motion_error) {
- motion_error = tmp_err;
- mv = tmp_mv;
- }
+ // TODO(pengchong): Replace the hard-coded threshold
+ if (raw_motion_error > 25 || lc != NULL) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if (!is_zero_mv(best_ref_mv)) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
}
+ }
- // Search in an older reference frame.
- if (((lc == NULL && cm->current_video_frame > 1) ||
- (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
- gld_yv12 != NULL) {
- // Assume 0,0 motion with no mv overhead.
- int gf_motion_error;
+ // Search in an older reference frame.
+ if (((lc == NULL && cm->current_video_frame > 1) ||
+ (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
+ gld_yv12 != NULL) {
+ // Assume 0,0 motion with no mv overhead.
+ int gf_motion_error;
- xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+ xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- gf_motion_error = highbd_get_prediction_error(
- bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
- } else {
- gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &xd->plane[0].pre[0]);
- }
-#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ gf_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
+ }
+#else
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
- &gf_motion_error);
-
- if (gf_motion_error < motion_error && gf_motion_error < this_error)
- ++second_ref_count;
-
- // Reset to last frame as reference buffer.
- xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
- xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
- xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
-
- // In accumulating a score for the older reference frame take the
- // best of the motion predicted score and the intra coded error
- // (just as will be done for) accumulation of "coded_error" for
- // the last frame.
- if (gf_motion_error < this_error)
- sr_coded_error += gf_motion_error;
- else
- sr_coded_error += this_error;
- } else {
- sr_coded_error += motion_error;
- }
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
+
+ if (gf_motion_error < motion_error && gf_motion_error < this_error)
+ ++(fp_acc_data->second_ref_count);
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+ // In accumulating a score for the older reference frame take the
+ // best of the motion predicted score and the intra coded error
+ // (just as will be done for) accumulation of "coded_error" for
+ // the last frame.
+ if (gf_motion_error < this_error)
+ fp_acc_data->sr_coded_error += gf_motion_error;
+ else
+ fp_acc_data->sr_coded_error += this_error;
} else {
- sr_coded_error += motion_error;
+ fp_acc_data->sr_coded_error += motion_error;
}
+ } else {
+ fp_acc_data->sr_coded_error += motion_error;
+ }
- // Start by assuming that intra mode is best.
- best_ref_mv.row = 0;
- best_ref_mv.col = 0;
+ // Start by assuming that intra mode is best.
+ best_ref_mv->row = 0;
+ best_ref_mv->col = 0;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // intra prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (motion_error <= this_error) {
+ vpx_clear_system_state();
+
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+ (this_error < (2 * intrapenalty))) {
+ fp_acc_data->neutral_count += 1.0;
+#if ENABLE_MT_BIT_MATCH
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = 1.0;
+#endif
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ fp_acc_data->neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+#if ENABLE_MT_BIT_MATCH
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+#endif
+ }
+
+ mv.row *= 8;
+ mv.col *= 8;
+ this_error = motion_error;
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = mv;
+ xd->mi[0]->tx_size = TX_4X4;
+ xd->mi[0]->ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE;
+ vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+ vp9_encode_sby_pass1(x, bsize);
+ fp_acc_data->sum_mvr += mv.row;
+ fp_acc_data->sum_mvr_abs += abs(mv.row);
+ fp_acc_data->sum_mvc += mv.col;
+ fp_acc_data->sum_mvc_abs += abs(mv.col);
+ fp_acc_data->sum_mvrs += mv.row * mv.row;
+ fp_acc_data->sum_mvcs += mv.col * mv.col;
+ ++(fp_acc_data->intercount);
+
+ *best_ref_mv = mv;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- // intra prediction statistics
+ // inter prediction statistics
cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
if (this_error > FPMB_ERROR_LARGE_TH) {
cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
@@ -1055,214 +1202,229 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
#endif
- if (motion_error <= this_error) {
- vpx_clear_system_state();
-
- // Keep a count of cases where the inter and intra were very close
- // and very low. This helps with scene cut detection for example in
- // cropped clips with black bars at the sides or top and bottom.
- if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
- (this_error < (2 * intrapenalty))) {
- neutral_count += 1.0;
- // Also track cases where the intra is not much worse than the inter
- // and use this in limiting the GF/arf group length.
- } else if ((this_error > NCOUNT_INTRA_THRESH) &&
- (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
- neutral_count +=
- (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
- }
-
- mv.row *= 8;
- mv.col *= 8;
- this_error = motion_error;
- xd->mi[0]->mode = NEWMV;
- xd->mi[0]->mv[0].as_mv = mv;
- xd->mi[0]->tx_size = TX_4X4;
- xd->mi[0]->ref_frame[0] = LAST_FRAME;
- xd->mi[0]->ref_frame[1] = NONE;
- vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
- vp9_encode_sby_pass1(x, bsize);
- sum_mvr += mv.row;
- sum_mvr_abs += abs(mv.row);
- sum_mvc += mv.col;
- sum_mvc_abs += abs(mv.col);
- sum_mvrs += mv.row * mv.row;
- sum_mvcs += mv.col * mv.col;
- ++intercount;
-
- best_ref_mv = mv;
+ if (!is_zero_mv(&mv)) {
+ ++(fp_acc_data->mvcount);
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- // inter prediction statistics
- cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
- cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
- if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK;
+ // check estimated motion direction
+ if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
+ // right direction
cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_ERROR_LARGE_MASK;
- } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ FPMB_MOTION_RIGHT_MASK;
+ } else if (mv.as_mv.row < 0 &&
+ abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
+ // up direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK;
+ } else if (mv.as_mv.col < 0 &&
+ abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
+ // left direction
cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_ERROR_SMALL_MASK;
+ FPMB_MOTION_LEFT_MASK;
+ } else {
+ // down direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_DOWN_MASK;
}
}
#endif
- if (!is_zero_mv(&mv)) {
- ++mvcount;
+ // Does the row vector point inwards or outwards?
+ if (mb_row < cm->mb_rows / 2) {
+ if (mv.row > 0)
+ --(fp_acc_data->sum_in_vectors);
+ else if (mv.row < 0)
+ ++(fp_acc_data->sum_in_vectors);
+ } else if (mb_row > cm->mb_rows / 2) {
+ if (mv.row > 0)
+ ++(fp_acc_data->sum_in_vectors);
+ else if (mv.row < 0)
+ --(fp_acc_data->sum_in_vectors);
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < cm->mb_cols / 2) {
+ if (mv.col > 0)
+ --(fp_acc_data->sum_in_vectors);
+ else if (mv.col < 0)
+ ++(fp_acc_data->sum_in_vectors);
+ } else if (mb_col > cm->mb_cols / 2) {
+ if (mv.col > 0)
+ ++(fp_acc_data->sum_in_vectors);
+ else if (mv.col < 0)
+ --(fp_acc_data->sum_in_vectors);
+ }
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+ } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+ fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+ } else { // 0,0 mv but high error
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+ }
+ } else { // Intra < inter error
+ if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
+ fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+ else
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+ }
+ } else {
+ fp_acc_data->sr_coded_error += (int64_t)this_error;
+ }
+ fp_acc_data->coded_error += (int64_t)this_error;
+
+ recon_yoffset += 16;
+ recon_uvoffset += uv_mb_height;
+
+ // Accumulate row level stats to the corresponding tile stats
+ if (cpi->new_mt && mb_col == (tile.mi_col_end >> 1) - 1)
+ accumulate_fp_mb_row_stat(tile_data, fp_acc_data);
+
+ (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c,
+ num_mb_cols);
+ }
+ vpx_clear_system_state();
+}
+
+// Runs the first pass over the whole frame on the calling thread, one MB row
+// at a time, accumulating the per-MB statistics into *fp_acc_data.
+// The frame is treated as a single tile (tile row 0, tile column 0).
+static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mb_row;
+ // Stack-local tile descriptor. Only tile_info is initialised in this
+ // function (via vp9_tile_init() below).
+ TileDataEnc tile_data;
+ TileInfo *tile = &tile_data.tile_info;
+ MV zero_mv = { 0, 0 };
+ MV best_ref_mv;
+ // Tiling is ignored in the first pass.
+ vp9_tile_init(tile, cm, 0, 0);
+
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ // The motion search starting point is reset to (0,0) at the start of
+ // every MB row; the row function updates it as it walks the columns.
+ best_ref_mv = zero_mv;
+ vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data,
+ &best_ref_mv, mb_row);
+ }
+}
+
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TWO_PASS *twopass = &cpi->twopass;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+ const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+ LAYER_CONTEXT *const lc =
+ is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
+ : NULL;
+ BufferPool *const pool = cm->buffer_pool;
+
+ FIRSTPASS_DATA fp_temp_data;
+ FIRSTPASS_DATA *fp_acc_data = &fp_temp_data;
+
+ vpx_clear_system_state();
+ vp9_zero(fp_temp_data);
+ fp_acc_data->image_data_start_row = INVALID_ROW;
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- cpi->twopass.frame_mb_stats_buf[mb_index] &=
- ~FPMB_MOTION_ZERO_MASK;
- // check estimated motion direction
- if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
- // right direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_RIGHT_MASK;
- } else if (mv.as_mv.row < 0 &&
- abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
- // up direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_UP_MASK;
- } else if (mv.as_mv.col < 0 &&
- abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
- // left direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_LEFT_MASK;
- } else {
- // down direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_DOWN_MASK;
- }
- }
+ if (cpi->use_fp_mb_stats) {
+ vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
+ }
#endif
- // Does the row vector point inwards or outwards?
- if (mb_row < cm->mb_rows / 2) {
- if (mv.row > 0)
- --sum_in_vectors;
- else if (mv.row < 0)
- ++sum_in_vectors;
- } else if (mb_row > cm->mb_rows / 2) {
- if (mv.row > 0)
- ++sum_in_vectors;
- else if (mv.row < 0)
- --sum_in_vectors;
- }
+ set_first_pass_params(cpi);
+ vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
- // Does the col vector point inwards or outwards?
- if (mb_col < cm->mb_cols / 2) {
- if (mv.col > 0)
- --sum_in_vectors;
- else if (mv.col < 0)
- ++sum_in_vectors;
- } else if (mb_col > cm->mb_cols / 2) {
- if (mv.col > 0)
- ++sum_in_vectors;
- else if (mv.col < 0)
- --sum_in_vectors;
- }
- frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
- } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
- frame_noise_energy += fp_estimate_block_noise(x, bsize);
- } else { // 0,0 mv but high error
- frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
- }
- } else { // Intra < inter error
- if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
- frame_noise_energy += fp_estimate_block_noise(x, bsize);
- else
- frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
- }
- } else {
- sr_coded_error += (int64_t)this_error;
- }
- coded_error += (int64_t)this_error;
+ if (lc != NULL) {
+ twopass = &lc->twopass;
- // Adjust to the next column of MBs.
- x->plane[0].src.buf += 16;
- x->plane[1].src.buf += uv_mb_height;
- x->plane[2].src.buf += uv_mb_height;
+ cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
- recon_yoffset += 16;
- recon_uvoffset += uv_mb_height;
+ if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+ REF_FRAMES) {
+ cpi->gld_fb_idx =
+ cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+ cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+ cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
+ } else {
+ cpi->refresh_golden_frame = 0;
}
- // Adjust to the next row of MBs.
- x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
- x->plane[1].src.buf +=
- uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
- x->plane[2].src.buf +=
- uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+ if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
- vpx_clear_system_state();
- }
+ vp9_scale_references(cpi);
- // Clamp the image start to rows/2. This number of rows is discarded top
- // and bottom as dead data so rows / 2 means the frame is blank.
- if ((image_data_start_row > cm->mb_rows / 2) ||
- (image_data_start_row == INVALID_ROW)) {
- image_data_start_row = cm->mb_rows / 2;
+ // Use either last frame or alt frame for motion search.
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+ first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (first_ref_buf == NULL)
+ first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
+ }
+
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ if (gld_yv12 == NULL) {
+ gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ }
+ } else {
+ gld_yv12 = NULL;
+ }
+
+ set_ref_ptrs(cm, xd,
+ (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
+ (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
+
+ cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+ &cpi->scaled_source, 0);
}
- // Exclude any image dead zone
- if (image_data_start_row > 0) {
- intra_skip_count =
- VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+
+ vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+ vp9_setup_src_planes(x, cpi->Source, 0, 0);
+ vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
+
+ if (!frame_is_intra_only(cm)) {
+ vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
}
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ vp9_frame_init_quantizer(cpi);
+
+ x->skip_recode = 0;
+
+ vp9_init_mv_probs(cm);
+ vp9_initialize_rd_consts(cpi);
+
+ cm->log2_tile_rows = 0;
+
{
FIRSTPASS_STATS fps;
- // The minimum error here ensures some bit allocation to frames even
- // in static regions. The allocation per MB declines for larger formats
- // where the typical "real" energy per MB also falls.
- // Initial estimate here uses sqrt(mbs) to define the min_err, where the
- // number of mbs is proportional to the image area.
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
- ? cpi->initial_mbs
- : cpi->common.MBs;
- const double min_err = 200 * sqrt(num_mbs);
-
- intra_factor = intra_factor / (double)num_mbs;
- brightness_factor = brightness_factor / (double)num_mbs;
- fps.weight = intra_factor * brightness_factor;
-
- fps.frame = cm->current_video_frame;
- fps.spatial_layer_id = cpi->svc.spatial_layer_id;
- fps.coded_error = (double)(coded_error >> 8) + min_err;
- fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
- fps.intra_error = (double)(intra_error >> 8) + min_err;
- fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs;
- fps.count = 1.0;
- fps.pcnt_inter = (double)intercount / num_mbs;
- fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
- fps.pcnt_neutral = (double)neutral_count / num_mbs;
- fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
- fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs;
- fps.inactive_zone_rows = (double)image_data_start_row;
- // Currently set to 0 as most issues relate to letter boxing.
- fps.inactive_zone_cols = (double)0;
-
- if (mvcount > 0) {
- fps.MVr = (double)sum_mvr / mvcount;
- fps.mvr_abs = (double)sum_mvr_abs / mvcount;
- fps.MVc = (double)sum_mvc / mvcount;
- fps.mvc_abs = (double)sum_mvc_abs / mvcount;
- fps.MVrv =
- ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
- fps.MVcv =
- ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
- fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
- fps.pcnt_motion = (double)mvcount / num_mbs;
+ TileDataEnc *first_tile_col;
+ if (!cpi->new_mt) {
+ cm->log2_tile_cols = 0;
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+ first_pass_encode(cpi, fp_acc_data);
+ first_pass_stat_calc(cpi, &fps, fp_acc_data);
} else {
- fps.MVr = 0.0;
- fps.mvr_abs = 0.0;
- fps.MVc = 0.0;
- fps.mvc_abs = 0.0;
- fps.MVrv = 0.0;
- fps.MVcv = 0.0;
- fps.mv_in_out_count = 0.0;
- fps.pcnt_motion = 0.0;
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+#if ENABLE_MT_BIT_MATCH
+ cm->log2_tile_cols = 0;
+ vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs);
+#endif
+ vp9_encode_fp_row_mt(cpi);
+ first_tile_col = &cpi->tile_data[0];
+#if ENABLE_MT_BIT_MATCH
+ accumulate_floating_point_stats(cpi, first_tile_col);
+#endif
+ first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
}
// Dont allow a value of 0 for duration.
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 5541893dc..ee6d5f360 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,6 +39,40 @@ typedef struct {
} FIRSTPASS_MB_STATS;
#endif
+// Sentinel stored in FIRSTPASS_DATA::image_data_start_row until a real row is
+// recorded. Parenthesized so the expansion is a single operand wherever the
+// macro is used inside a larger expression.
+#define INVALID_ROW (-1)
+
+// Compile-time switch (off by default). When set to 1 the row-MT first pass
+// runs with a single tile column, keeps per-MB floating point stats in
+// TWO_PASS::fp_mb_float_stats and folds them in after all rows are done.
+// NOTE(review): presumably this makes the multi-threaded stats bit-match the
+// single-threaded ones -- confirm against accumulate_floating_point_stats().
+#define ENABLE_MT_BIT_MATCH 0
+#if ENABLE_MT_BIT_MATCH
+// Per-macroblock floating point contributions kept for deferred accumulation.
+typedef struct {
+  double frame_mb_intra_factor;
+  double frame_mb_brightness_factor;
+  double frame_mb_neutral_count;
+} FP_MB_FLOAT_STATS;
+#endif
+
+// Raw accumulators gathered while the first pass runs over macroblock rows.
+// vp9_first_pass() hands one of these to first_pass_stat_calc() to build the
+// frame's stats. In row-MT mode every tile owns a copy (TileDataEnc::fp_data)
+// which vp9_multi_thread_tile_init() zeroes before each frame.
+typedef struct {
+ double intra_factor;
+ double brightness_factor;
+ int64_t coded_error;
+ int64_t sr_coded_error;
+ int64_t frame_noise_energy;
+ int64_t intra_error;
+ int intercount;
+ int second_ref_count;
+ double neutral_count;
+ int intra_skip_count;
+ // Set to INVALID_ROW by vp9_multi_thread_tile_init() until a row is recorded.
+ int image_data_start_row;
+ // Motion vector sums. NOTE(review): presumably normalised by mvcount when
+ // the frame stats are computed -- confirm in first_pass_stat_calc().
+ int mvcount;
+ int sum_mvr;
+ int sum_mvr_abs;
+ int sum_mvc;
+ int sum_mvc_abs;
+ int64_t sum_mvrs;
+ int64_t sum_mvcs;
+ int sum_in_vectors;
+ int intra_smooth_count;
+} FIRSTPASS_DATA;
+
typedef struct {
double frame;
double weight;
@@ -114,6 +148,11 @@ typedef struct {
uint8_t *this_frame_mb_stats;
FIRSTPASS_MB_STATS firstpass_mb_stats;
#endif
+
+#if ENABLE_MT_BIT_MATCH
+ FP_MB_FLOAT_STATS *fp_mb_float_stats;
+#endif
+
// An indication of the content type of the current frame
FRAME_CONTENT_TYPE fr_content_type;
@@ -141,12 +180,20 @@ typedef struct {
} TWO_PASS;
struct VP9_COMP;
+struct ThreadData;
+struct TileDataEnc;
void vp9_init_first_pass(struct VP9_COMP *cpi);
void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
void vp9_end_first_pass(struct VP9_COMP *cpi);
+// First pass entry point for one macroblock row (mb_row) of one tile.
+// NOTE(review): presumably accumulates its results into fp_acc_data and
+// carries best_ref_mv across calls; the definition is in vp9_firstpass.c --
+// confirm there.
+void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
+ struct ThreadData *td,
+ FIRSTPASS_DATA *fp_acc_data,
+ struct TileDataEnc *tile_data,
+ MV *best_ref_mv, int mb_row);
+
void vp9_init_second_pass(struct VP9_COMP *cpi);
void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_job_queue.h b/vp9/encoder/vp9_job_queue.h
new file mode 100644
index 000000000..89c08f207
--- /dev/null
+++ b/vp9/encoder/vp9_job_queue.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_
+#define VP9_ENCODER_VP9_JOB_QUEUE_H_
+
+// Kind of work a job queue is prepared for (see vp9_prepare_job_queue()).
+// ENCODE_JOB queues hold one job per superblock row; every other type holds
+// one job per macroblock row.
+typedef enum {
+ FIRST_PASS_JOB,
+ ENCODE_JOB,
+ ARNR_JOB,
+ NUM_JOB_TYPES,
+} JOB_TYPE;
+
+// Encode job parameters: identifies one vertical unit row of one tile.
+typedef struct {
+ int vert_unit_row_num; // Index of the vertical unit row
+ int tile_col_id; // Index of the tile column the job belongs to
+ int tile_row_id; // Index of the tile row the job belongs to
+} JobNode;
+
+// Job queue element parameters: one node of a singly linked list of jobs.
+typedef struct {
+ // Pointer to the next link in the job queue (a JobQueue *); NULL on the
+ // last job of a tile column.
+ void *next;
+
+ // Job information context of the module
+ JobNode job_info;
+} JobQueue;
+
+// Job queue handle: head of one tile column's job list plus a progress count.
+typedef struct {
+ // Pointer to the next job to hand out (a JobQueue *); NULL once every job
+ // of the tile column has been taken.
+ void *next;
+
+ // Counter to store the number of jobs picked up for processing
+ int num_jobs_acquired;
+} JobQueueHandle;
+
+#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_
diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c
new file mode 100644
index 000000000..23b0b4276
--- /dev/null
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+
+// Pops the job at the head of tile_id's queue.
+// Returns a pointer to its JobNode, or NULL when no job is left. The queue
+// head and the acquired-job counter are only touched while the tile's job
+// mutex is held (when built with CONFIG_MULTITHREAD).
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id) {
+  RowMTInfo *const tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_id];
+  JobQueueHandle *const queue = &tile_ctxt->job_queue_hdl;
+  JobQueue *head;
+  JobNode *job = NULL;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&tile_ctxt->job_mutex);
+#endif
+
+  head = (JobQueue *)queue->next;
+  if (head != NULL) {
+    // Advance the handle past the job being handed out.
+    queue->next = head->next;
+    queue->num_jobs_acquired++;
+    job = &head->job_info;
+  }
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&tile_ctxt->job_mutex);
+#endif
+
+  return job;
+}
+
+// Allocates the resources used by row based multi-threading: the shared job
+// queue buffer, one job mutex per tile column and the row sync memory of
+// every tile column. Tile rows > 0 reuse the sync object of tile row 0 in the
+// same column. A failed job queue allocation is reported through
+// CHECK_MEM_ERROR instead of leaving a NULL queue behind for later use.
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
+  struct VP9Common *cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_row, tile_col;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // The same buffer serves queues built per MB row and per SB row (see
+  // vp9_prepare_job_queue()), so size it for whichever is larger.
+  const int jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows);
+  // Calculate the total number of jobs
+  const int total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->allocated_tile_cols = tile_cols;
+  multi_thread_ctxt->allocated_tile_rows = tile_rows;
+  multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
+
+  CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue,
+                  (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
+
+#if CONFIG_MULTITHREAD
+  // Create mutex for each tile
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    pthread_mutex_init(&row_mt_info->job_mutex, NULL);
+  }
+#endif
+
+  // Allocate memory for row based multi-threading
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+  }
+
+  // Assign the sync pointer of tile row zero for every tile row > 0
+  for (tile_row = 1; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileDataEnc *this_col_tile = &cpi->tile_data[tile_col];
+      this_tile->row_mt_sync = this_col_tile->row_mt_sync;
+    }
+  }
+
+  // Calculate the number of vertical units in the given tile row
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols];
+    TileInfo *tile_info = &this_tile->tile_info;
+    multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
+        get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+  }
+}
+
+// Releases what vp9_row_mt_mem_alloc() set up: the job queue buffer, the
+// per tile column job mutexes and the per tile column row sync memory. The
+// number of tile columns walked is the one recorded at allocation time.
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_col;
+
+  // Deallocate memory for job queue. The pointer is cleared so that a second
+  // call (or a later vp9_prepare_job_queue()) never sees a freed buffer.
+  if (multi_thread_ctxt->job_queue) {
+    vpx_free(multi_thread_ctxt->job_queue);
+    multi_thread_ctxt->job_queue = NULL;
+  }
+
+#if CONFIG_MULTITHREAD
+  // Destroy mutex for each tile. row_mt_info[] elements always have a valid
+  // address, so no NULL check is needed on them.
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    pthread_mutex_destroy(&row_mt_info->job_mutex);
+  }
+#endif
+
+  // Free row based multi-threading sync memory
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+  }
+}
+
+// Per-frame reset of the row-MT state of every tile column: marks all rows as
+// "no column done yet" and clears the first pass accumulators.
+void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int num_tile_cols = 1 << cm->log2_tile_cols;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  // Pass 1 synchronises per MB row, every other pass per SB row.
+  const int rows_to_reset = (cpi->oxcf.pass == 1) ? cm->mb_rows : sb_rows;
+  int tile_col;
+
+  for (tile_col = 0; tile_col < num_tile_cols; ++tile_col) {
+    TileDataEnc *const tile = &cpi->tile_data[tile_col];
+
+    // All bytes 0xff, i.e. cur_col == -1 for every row.
+    memset(tile->row_mt_sync.cur_col, -1,
+           sizeof(*tile->row_mt_sync.cur_col) * rows_to_reset);
+
+    vp9_zero(tile->fp_data);
+    tile->fp_data.image_data_start_row = INVALID_ROW;
+  }
+}
+
+// Gives each worker a starting tile column: worker 0 gets tile 0, worker 1
+// gets tile 1, and so on, wrapping back to tile 0 after the last tile column.
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers) {
+  int next_tile = 0;
+  int worker;
+
+  for (worker = 0; worker < num_workers; ++worker) {
+    multi_thread_ctxt->thread_id_to_tile_id[worker] = next_tile;
+    next_tile = (next_tile + 1 == tile_cols) ? 0 : next_tile + 1;
+  }
+}
+
+// Returns how many jobs of tile column cur_tile_id have not been handed out
+// yet. The acquired-job counter is read under the tile's job mutex when built
+// with CONFIG_MULTITHREAD.
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id) {
+  RowMTInfo *const tile_ctxt = &multi_thread_ctxt->row_mt_info[cur_tile_id];
+  int jobs_left;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&tile_ctxt->job_mutex);
+#endif
+
+  jobs_left = multi_thread_ctxt->jobs_per_tile_col -
+              tile_ctxt->job_queue_hdl.num_jobs_acquired;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&tile_ctxt->job_mutex);
+#endif
+
+  return jobs_left;
+}
+
+// Builds one linked list of jobs per tile column inside the shared job queue
+// buffer and resets the workers' bookkeeping.
+//
+// Each tile column gets jobs_per_tile_col consecutive JobQueue nodes, one per
+// vertical unit row (SB rows for ENCODE_JOB, MB rows otherwise), chained in
+// row order and NULL terminated. For ENCODE_JOB the tile row id stored in a
+// job advances whenever num_tile_vert_sbs[] rows of the current tile row have
+// been queued; for other job types it stays 0.
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) {
+  VP9_COMMON *const cm = &cpi->common;
+  MultiThreadHandle *const multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int jobs_per_tile_col =
+      (job_type == ENCODE_JOB) ? sb_rows : cm->mb_rows;
+  const int total_jobs = jobs_per_tile_col * tile_cols;
+  // First node of the tile column currently being filled in.
+  JobQueue *tile_jobs = multi_thread_ctxt->job_queue;
+  int tile_col, worker;
+
+  multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col;
+
+  // Start from an all-zero job queue buffer.
+  memset(tile_jobs, 0, total_jobs * sizeof(JobQueue));
+
+  for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+    RowMTInfo *const tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col];
+    int tile_row = 0;
+    int rows_in_cur_tile = 0;  // rows already queued for tile_row
+    int row;
+
+    tile_ctxt->job_queue_hdl.next = (void *)tile_jobs;
+    tile_ctxt->job_queue_hdl.num_jobs_acquired = 0;
+
+    for (row = 0; row < jobs_per_tile_col; ++row) {
+      JobQueue *const node = &tile_jobs[row];
+
+      node->job_info.vert_unit_row_num = row;
+      node->job_info.tile_col_id = tile_col;
+      node->job_info.tile_row_id = tile_row;
+      node->next = (void *)(node + 1);
+
+      if (job_type == ENCODE_JOB &&
+          rows_in_cur_tile >=
+              multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) {
+        // Last row of this tile row: following jobs belong to the next one.
+        ++tile_row;
+        rows_in_cur_tile = 0;
+      } else {
+        ++rows_in_cur_tile;
+      }
+    }
+
+    // Terminate the list at the last job of this tile column.
+    tile_jobs[jobs_per_tile_col - 1].next = (void *)NULL;
+
+    // Move to the next tile column's slice of the buffer.
+    tile_jobs += jobs_per_tile_col;
+  }
+
+  for (worker = 0; worker < cpi->num_workers; ++worker) {
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[worker];
+
+    thread_data->thread_id = worker;
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+      thread_data->tile_completion_status[tile_col] = 0;
+  }
+}
+
+// Called by a worker that ran out of jobs in *cur_tile_id. Marks that tile
+// column as complete in the caller's tile_completion_status[] and looks for
+// the not-yet-complete tile column with the most jobs still to hand out.
+//
+// Returns 1 when no tile column has jobs left. Otherwise stores the chosen
+// tile column in *cur_tile_id and returns 0.
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols) {
+  int busiest_tile = -1;
+  int most_jobs_left = 0;
+  int tile_col;
+
+  // The current tile is done; flag it so the scan below skips it.
+  tile_completion_status[*cur_tile_id] = 1;
+
+  for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+    int jobs_left;
+
+    if (tile_completion_status[tile_col] != 0) continue;
+
+    jobs_left = vp9_get_job_queue_status(multi_thread_ctxt, tile_col);
+
+    // Remember drained tiles so later scans do not query them again.
+    if (jobs_left == 0) tile_completion_status[tile_col] = 1;
+
+    if (jobs_left > most_jobs_left) {
+      most_jobs_left = jobs_left;
+      busiest_tile = tile_col;
+    }
+  }
+
+  if (busiest_tile == -1) return 1;
+
+  // Switch the caller to the least processed tile.
+  *cur_tile_id = busiest_tile;
+  return 0;
+}
diff --git a/vp9/encoder/vp9_multi_thread.h b/vp9/encoder/vp9_multi_thread.h
new file mode 100644
index 000000000..bfc0c0ae4
--- /dev/null
+++ b/vp9/encoder/vp9_multi_thread.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H
+#define VP9_ENCODER_VP9_MULTI_THREAD_H
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_job_queue.h"
+
+// Pops the next job of tile column tile_id. Returns a JobNode * (as void *),
+// or NULL when that tile column has no job left.
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+ int tile_id);
+
+// Rebuilds the per tile column job lists for job_type and clears every
+// worker's tile completion flags.
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type);
+
+// Returns the number of jobs of tile column cur_tile_id not yet handed out.
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+ int cur_tile_id);
+
+// Assigns a starting tile column to each of num_workers workers, cycling
+// through the tile columns in order.
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+ int tile_cols, int num_workers);
+
+// Resets the row sync columns and first pass accumulators of each tile column.
+void vp9_multi_thread_tile_init(VP9_COMP *cpi);
+
+// Allocates the job queue, job mutexes and row sync memory.
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi);
+
+// Frees what vp9_row_mt_mem_alloc() allocated.
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi);
+
+// Marks *cur_tile_id complete and selects the tile column with the most jobs
+// remaining. Returns 1 if none remain, else 0 with *cur_tile_id updated.
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+ int *tile_completion_status, int *cur_tile_id,
+ int tile_cols);
+
+#endif // VP9_ENCODER_VP9_MULTI_THREAD_H
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 1d5ed7c50..c239ca6bc 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -51,6 +51,7 @@ struct vp9_extracfg {
vpx_color_range_t color_range;
int render_width;
int render_height;
+ unsigned int new_mt;
};
static struct vp9_extracfg default_extra_cfg = {
@@ -82,6 +83,7 @@ static struct vp9_extracfg default_extra_cfg = {
0, // color range
0, // render width
0, // render height
+ 1, // new_mt
};
struct vpx_codec_alg_priv {
@@ -245,6 +247,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
"kf_min_dist not supported in auto mode, use 0 "
"or kf_max_dist instead.");
+ RANGE_CHECK(extra_cfg, new_mt, 0, 1);
RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
@@ -554,6 +557,8 @@ static vpx_codec_err_t set_encoder_config(
oxcf->target_level = extra_cfg->target_level;
+ oxcf->new_mt = extra_cfg->new_mt;
+
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
#if CONFIG_SPATIAL_SVC
oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
@@ -842,6 +847,13 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+// Control handler for VP9E_SET_NEW_MT: writes the requested value into a copy
+// of the current extra config and applies it through update_extra_cfg().
+static vpx_codec_err_t ctrl_set_new_mt(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.new_mt = CAST(VP9E_SET_NEW_MT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
int *const arg = va_arg(args, int *);
if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
@@ -1594,6 +1606,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
{ VP9E_SET_RENDER_SIZE, ctrl_set_render_size },
{ VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
+ { VP9E_SET_NEW_MT, ctrl_set_new_mt },
// Getters
{ VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a8ca0d593..87d9a775b 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -39,9 +39,12 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
VP9_CX_SRCS-yes += encoder/vp9_extend.h
VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h
VP9_CX_SRCS-yes += encoder/vp9_encoder.h
VP9_CX_SRCS-yes += encoder/vp9_quantize.h
VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index cc90159bc..a04d7dd66 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -547,6 +547,14 @@ enum vp8e_enc_control_id {
*/
VP9E_SET_TARGET_LEVEL,
+ /*!\brief Codec control function to set row level multi-threading.
+ *
+ * 0 : off, 1 : on
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_NEW_MT,
+
/*!\brief Codec control function to get bitstream level.
*
* Supported in codecs: VP9
@@ -838,6 +846,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
#define VPX_CTRL_VP9E_SET_TARGET_LEVEL
+VPX_CTRL_USE_TYPE(VP9E_SET_NEW_MT, unsigned int)
+#define VPX_CTRL_VP9E_SET_NEW_MT
+
VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
#define VPX_CTRL_VP9E_GET_LEVEL
diff --git a/vpxenc.c b/vpxenc.c
index 9cd10ab2e..abb3baebd 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -470,6 +470,9 @@ static const arg_def_t target_level = ARG_DEF(
NULL, "target-level", 1,
"Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
" 11: level 1.1; ... 62: level 6.2)");
+
+// --new-mt=<arg>: takes one value. NOTE(review): presumably 0 = off and
+// 1 = on, forwarded as VP9E_SET_NEW_MT -- confirm against vp9_arg_ctrl_map.
+static const arg_def_t new_mt =
+ ARG_DEF(NULL, "new-mt", 1, "Enable row based multi-threading in VP9");
#endif
#if CONFIG_VP9_ENCODER
@@ -498,6 +501,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
&min_gf_interval,
&max_gf_interval,
&target_level,
+ &new_mt,
#if CONFIG_VP9_HIGHBITDEPTH
&bitdeptharg,
&inbitdeptharg,
@@ -528,6 +532,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
VP9E_SET_MIN_GF_INTERVAL,
VP9E_SET_MAX_GF_INTERVAL,
VP9E_SET_TARGET_LEVEL,
+ VP9E_SET_NEW_MT,
0 };
#endif