summary | refs | log | tree | commit | diff
path: root/vp9
diff options
context:
space:
mode:
Diffstat (limited to 'vp9')
-rw-r--r-- vp9/common/vp9_blockd.h | 2
-rw-r--r-- vp9/common/vp9_enums.h | 2
-rw-r--r-- vp9/common/vp9_rtcd_defs.pl | 1
-rw-r--r-- vp9/common/vp9_thread_common.c | 6
-rw-r--r-- vp9/common/vp9_thread_common.h | 2
-rw-r--r-- vp9/decoder/vp9_decodeframe.c | 643
-rw-r--r-- vp9/decoder/vp9_decoder.c | 61
-rw-r--r-- vp9/decoder/vp9_decoder.h | 36
-rw-r--r-- vp9/decoder/vp9_job_queue.c | 124
-rw-r--r-- vp9/decoder/vp9_job_queue.h | 45
-rw-r--r-- vp9/encoder/vp9_context_tree.h | 3
-rw-r--r-- vp9/encoder/vp9_denoiser.c | 2
-rw-r--r-- vp9/encoder/vp9_encodeframe.c | 99
-rw-r--r-- vp9/encoder/vp9_encoder.h | 6
-rw-r--r-- vp9/encoder/vp9_firstpass.c | 17
-rw-r--r-- vp9/encoder/vp9_mcomp.c | 18
-rw-r--r-- vp9/encoder/vp9_mcomp.h | 3
-rw-r--r-- vp9/encoder/vp9_pickmode.c | 23
-rw-r--r-- vp9/encoder/vp9_rdopt.c | 2
-rw-r--r-- vp9/encoder/vp9_temporal_filter.c | 19
-rw-r--r-- vp9/encoder/vp9_temporal_filter.h | 2
-rw-r--r-- vp9/encoder/x86/temporal_filter_constants.h | 232
-rw-r--r-- vp9/encoder/x86/temporal_filter_sse4.c | 916
-rw-r--r-- vp9/vp9_common.mk | 3
-rw-r--r-- vp9/vp9cx.mk | 5
-rw-r--r-- vp9/vp9dx.mk | 2
26 files changed, 2044 insertions, 230 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f0887157e..e07a9f2d3 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -176,7 +176,7 @@ typedef struct macroblockd {
FRAME_CONTEXT *fc;
/* pointers to reference frames */
- RefBuffer *block_refs[2];
+ const RefBuffer *block_refs[2];
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index bc665534d..b33a3a297 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE {
MAX_PROFILES
} BITSTREAM_PROFILE;
+typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG;
+
#define BLOCK_4X4 0
#define BLOCK_4X8 1
#define BLOCK_8X4 2
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 3102b08a7..7e5e3c92c 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -188,6 +188,7 @@ add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned i
specialize qw/vp9_temporal_filter_apply sse4_1/;
add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
+specialize qw/vp9_apply_temporal_filter sse4_1/;
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index b008ed5cf..00882a5f9 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -475,6 +475,12 @@ void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
#endif // CONFIG_MULTITHREAD
}
+// Runs one loop-filter job: forwards the frame buffer, common state, planes,
+// start/stop rows and y_only flag held in lf_data, together with lf_sync, to
+// thread_loop_filter_rows().
+void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) {
+  const int first_row = lf_data->start;
+  const int last_row = lf_data->stop;
+
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          first_row, last_row, lf_data->y_only, lf_sync);
+}
+
// Accumulate frame counts.
void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
const FRAME_COUNTS *counts, int is_dec) {
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index b97e9ee13..1a2d79abd 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -70,7 +70,7 @@ void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync);
void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
int corrupted);
-void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row);
+void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync);
void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
const struct FRAME_COUNTS *counts, int is_dec);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index c9c85053d..c3bca3479 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -42,6 +42,7 @@
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_dsubexp.h"
+#include "vp9/decoder/vp9_job_queue.h"
#define MAX_VP9_HEADER_SIZE 80
@@ -1027,7 +1028,6 @@ static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
VP9_COMMON *const cm = &pbi->common;
- const int less8x8 = bsize < BLOCK_8X8;
const int bw = 1 << (bwl - 1);
const int bh = 1 << (bhl - 1);
const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
@@ -1059,7 +1059,7 @@ static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
const int eobtotal =
predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt);
- if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter
+ if (bsize >= BLOCK_8X8 && eobtotal == 0) mi->skip = 1; // skip loopfilter
}
}
@@ -1172,9 +1172,10 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi,
dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
}
-static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- int n4x4_l2) {
+static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int n4x4_l2, int parse_recon_flag,
+ process_block_fn_t process_block) {
VP9_COMMON *const cm = &pbi->common;
const int n8x8_l2 = n4x4_l2 - 1;
const int num_8x8_wh = 1 << n8x8_l2;
@@ -1187,60 +1188,10 @@ static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- partition = *xd->partition;
- xd->partition++;
-
- subsize = get_subsize(bsize, partition);
- if (!hbs) {
- // calculate bmode block dimensions (log 2)
- xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
- xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
- recon_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
- } else {
- switch (partition) {
- case PARTITION_NONE:
- recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
- break;
- case PARTITION_HORZ:
- recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
- if (has_rows)
- recon_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
- n8x8_l2);
- break;
- case PARTITION_VERT:
- recon_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
- if (has_cols)
- recon_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
- n4x4_l2);
- break;
- case PARTITION_SPLIT:
- recon_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2);
- recon_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2);
- recon_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2);
- recon_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2);
- break;
- default: assert(0 && "Invalid partition type");
- }
+ if (parse_recon_flag & PARSE) {
+ *xd->partition =
+ read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
}
-}
-
-static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- int n4x4_l2) {
- VP9_COMMON *const cm = &pbi->common;
- const int n8x8_l2 = n4x4_l2 - 1;
- const int num_8x8_wh = 1 << n8x8_l2;
- const int hbs = num_8x8_wh >> 1;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize;
- const int has_rows = (mi_row + hbs) < cm->mi_rows;
- const int has_cols = (mi_col + hbs) < cm->mi_cols;
- MACROBLOCKD *const xd = &twd->xd;
-
- if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
- *xd->partition =
- read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
partition = *xd->partition;
xd->partition++;
@@ -1250,38 +1201,44 @@ static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi,
// calculate bmode block dimensions (log 2)
xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
- parse_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
+ process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
} else {
switch (partition) {
case PARTITION_NONE:
- parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
+ process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
break;
case PARTITION_HORZ:
- parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
+ process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
if (has_rows)
- parse_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
- n8x8_l2);
+ process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
+ n8x8_l2);
break;
case PARTITION_VERT:
- parse_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
+ process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
if (has_cols)
- parse_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
- n4x4_l2);
+ process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
+ n4x4_l2);
break;
case PARTITION_SPLIT:
- parse_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2);
- parse_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2);
- parse_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2);
- parse_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2);
+ process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2,
+ parse_recon_flag, process_block);
+ process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
+ parse_recon_flag, process_block);
+ process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2,
+ parse_recon_flag, process_block);
+ process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize,
+ n8x8_l2, parse_recon_flag, process_block);
break;
default: assert(0 && "Invalid partition type");
}
}
- // update partition context
- if (bsize >= BLOCK_8X8 &&
- (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
- dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
+ if (parse_recon_flag & PARSE) {
+ // update partition context
+ if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) &&
+ bsize >= BLOCK_8X8)
+ dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
+ }
}
static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1688,6 +1645,317 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data,
}
}
+// Publishes completion of one recon-map entry: under the mutex selected by
+// sync_idx, sets recon_map[map_idx] to 1 and signals the condition variable
+// with the same index. Does nothing when CONFIG_MULTITHREAD is disabled.
+static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx,
+                      int sync_idx) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *const lock =
+      &row_mt_worker_data->recon_sync_mutex[sync_idx];
+  pthread_cond_t *const done = &row_mt_worker_data->recon_sync_cond[sync_idx];
+
+  pthread_mutex_lock(lock);
+  row_mt_worker_data->recon_map[map_idx] = 1;
+  pthread_cond_signal(done);
+  pthread_mutex_unlock(lock);
+#else
+  (void)row_mt_worker_data;
+  (void)map_idx;
+  (void)sync_idx;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Blocks until recon_map[map_idx] becomes non-zero, waiting on the condition
+// variable / mutex pair selected by sync_idx. Does nothing when
+// CONFIG_MULTITHREAD is disabled.
+static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx,
+                     int sync_idx) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *const lock =
+      &row_mt_worker_data->recon_sync_mutex[sync_idx];
+  pthread_cond_t *const done = &row_mt_worker_data->recon_sync_cond[sync_idx];
+  volatile int8_t *const flag = &row_mt_worker_data->recon_map[map_idx];
+
+  pthread_mutex_lock(lock);
+  // Re-test the flag after every wake-up.
+  for (;;) {
+    if (*flag) break;
+    pthread_cond_wait(done, lock);
+  }
+  pthread_mutex_unlock(lock);
+#else
+  (void)row_mt_worker_data;
+  (void)map_idx;
+  (void)sync_idx;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Records that one more tile has finished reconstructing superblock row
+// `row` and reports whether that makes all `num_tile_cols` tiles done.
+//
+// Returns 1 only when the counter for `row` reaches num_tile_cols. Returns 0
+// otherwise, and also (without touching the counter) when lf_sync->corrupted
+// is set or when CONFIG_MULTITHREAD is disabled.
+static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) {
+  int row_complete = 0;
+#if CONFIG_MULTITHREAD
+  int frame_corrupted;
+
+  // Take a snapshot of the corruption flag under its own mutex.
+  pthread_mutex_lock(&lf_sync->lf_mutex);
+  frame_corrupted = lf_sync->corrupted;
+  pthread_mutex_unlock(&lf_sync->lf_mutex);
+
+  if (frame_corrupted) return row_complete;
+
+  pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
+  lf_sync->num_tiles_done[row] += 1;
+  row_complete = (lf_sync->num_tiles_done[row] == num_tile_cols) ? 1 : 0;
+  pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);
+#else
+  (void)lf_sync;
+  (void)row;
+  (void)num_tile_cols;
+#endif
+  return row_complete;
+}
+
+// Counts one more finished tile and terminates the job queue once the count
+// equals the number of tile columns (1 << log2_tile_cols). Does nothing when
+// CONFIG_MULTITHREAD is disabled.
+static void vp9_tile_done(VP9Decoder *pbi) {
+#if CONFIG_MULTITHREAD
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int tiles_expected = 1 << pbi->common.log2_tile_cols;
+  int last_tile;
+
+  pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex);
+  row_mt_worker_data->num_tiles_done++;
+  last_tile = (row_mt_worker_data->num_tiles_done == tiles_expected);
+  pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex);
+
+  // The queue is terminated outside the counter's critical section.
+  if (!last_tile) return;
+  vp9_jobq_terminate(&row_mt_worker_data->jobq);
+#else
+  (void)pbi;
+#endif
+}
+
+// Ensures the job-queue buffer can hold
+// (tile_cols * sb_rows * 2 + sb_rows) Job entries for the current frame
+// dimensions. The buffer is only ever grown: when the required size exceeds
+// the recorded jobq_size the old buffer is freed, a zeroed one is allocated
+// and the queue is re-initialized on it.
+static void vp9_jobq_alloc(VP9Decoder *pbi) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job);
+
+  // Current buffer is already large enough.
+  if (!(jobq_size > row_mt_worker_data->jobq_size)) return;
+
+  vpx_free(row_mt_worker_data->jobq_buf);
+  CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size));
+  vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
+                jobq_size);
+  row_mt_worker_data->jobq_size = jobq_size;
+}
+
+// Reconstructs one superblock row (starting at mi_row) of tile column
+// cur_tile_col, using the per-superblock eob / dqcoeff / partition buffers
+// held in pbi->row_mt_worker_data.
+//
+// For every 64x64 superblock in the row, in order:
+//   1. unless this is superblock row 0, wait (map_read) until the superblock
+//      directly above has been marked in the recon map;
+//   2. point xd at this superblock's buffers and run
+//      process_partition(..., RECON, recon_block);
+//   3. if loop filtering is enabled and this is the last superblock of the
+//      row within the tile, possibly queue loop-filter jobs (see below);
+//   4. mark this superblock in the recon map (map_write).
+//
+// is_last_row: non-zero when mi_row is the last superblock row of the frame.
+// lf_sync:     loop-filter sync state handed to lpf_map_write_check().
+static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi,
+ int mi_row, int is_last_row, VP9LfSync *lf_sync,
+ int cur_tile_col) {
+ VP9_COMMON *const cm = &pbi->common;
+ RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
+ const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+ int mi_col_start = tile_data->xd.tile.mi_col_start;
+ int mi_col_end = tile_data->xd.tile.mi_col_end;
+ int mi_col;
+
+ // Left contexts are cleared at the start of each row of the tile.
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) {
+ const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+ int plane;
+ // Raster index of this superblock in the frame (same value as
+ // cur_sb_row * sb_cols + c).
+ const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c);
+
+ // Top Dependency
+ // Map index: the superblock one row up in the same column. Sync index:
+ // the (previous row, this tile column) mutex/cond pair.
+ if (cur_sb_row) {
+ map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c,
+ ((cur_sb_row - 1) * tile_cols) + cur_tile_col);
+ }
+
+ // Select this superblock's slice of the shared per-plane buffers.
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ tile_data->xd.plane[plane].eob =
+ row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2);
+ tile_data->xd.plane[plane].dqcoeff =
+ row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2);
+ }
+ tile_data->xd.partition =
+ row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB);
+ process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON,
+ recon_block);
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ // Queue LPF_JOB
+ int is_lpf_job_ready = 0;
+
+ // Only the last superblock of the row in this tile reports row completion.
+ if (mi_col + MI_BLOCK_SIZE >= mi_col_end) {
+ // Checks if this row has been decoded in all tiles
+ is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols);
+
+ if (is_lpf_job_ready) {
+ // NOTE(review): lpf_job.tile_col is never assigned before the struct
+ // is queued; the LPF_JOB branch of row_decode_worker_hook reads only
+ // row_num and job_type.
+ Job lpf_job;
+ lpf_job.job_type = LPF_JOB;
+ // The job queued here is for the previous superblock row, not the
+ // one that just completed.
+ if (cur_sb_row > 0) {
+ lpf_job.row_num = mi_row - MI_BLOCK_SIZE;
+ vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job,
+ sizeof(lpf_job));
+ }
+ // The final row has no following row to trigger it, so it is queued
+ // here as well.
+ if (is_last_row) {
+ lpf_job.row_num = mi_row;
+ vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job,
+ sizeof(lpf_job));
+ }
+ }
+ }
+ }
+ // Publish this superblock so the row below can proceed past its map_read.
+ map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c,
+ (cur_sb_row * tile_cols) + cur_tile_col);
+ }
+}
+
+// Parses one superblock row (starting at mi_row) of tile column cur_tile_col,
+// writing each superblock's results into its slice of the eob / dqcoeff /
+// partition buffers in pbi->row_mt_worker_data.
+//
+// data_end: pointer to the end-of-data pointer passed to
+//           setup_token_decoder(); it is dereferenced only when mi_row == 0.
+static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi,
+ int mi_row, int cur_tile_col, uint8_t **data_end) {
+ int mi_col;
+ VP9_COMMON *const cm = &pbi->common;
+ RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+ TileInfo *tile = &tile_data->xd.tile;
+ TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col];
+ const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+
+ vp9_zero(tile_data->dqcoeff);
+ // NOTE(review): the third argument (0) is presumably the tile row index;
+ // callers assert tile_rows == 1 — confirm against vp9_tile_init().
+ vp9_tile_init(tile, cm, 0, cur_tile_col);
+
+ /* Set up the bit reader only once per tile, when parsing its first row
+  * (mi_row == 0); later rows of the same tile keep using
+  * tile_data->bit_reader as left by the previous call. */
+ if (mi_row == 0) {
+ setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info,
+ &tile_data->bit_reader, pbi->decrypt_cb,
+ pbi->decrypt_state);
+ }
+ vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
+ tile_data->xd.error_info = &tile_data->error_info;
+
+ // Left contexts are cleared at the start of each row of the tile.
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
+ const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+ const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+ int plane;
+ // Raster index of this superblock in the frame.
+ const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c);
+ // Redirect xd to this superblock's slice of the shared buffers, replacing
+ // the pointers vp9_init_macroblockd() was given above.
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ tile_data->xd.plane[plane].eob =
+ row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2);
+ tile_data->xd.plane[plane].dqcoeff =
+ row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2);
+ }
+ tile_data->xd.partition =
+ row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB;
+ process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE,
+ parse_block);
+ }
+}
+
+// Worker entry point for row-based multi-threaded decoding.
+//
+// arg1 is the worker's ThreadData and arg2 points at the shared data_end
+// pointer. Both are taken as void * so that this function has exactly the
+// type it is stored and invoked through (VPxWorkerHook); calling a function
+// through a pointer to an incompatible function type is undefined behavior.
+//
+// The worker pulls jobs from row_mt_worker_data->jobq until
+// vp9_jobq_dequeue() returns non-zero:
+//   - LPF_JOB:   sets lf_data->start/stop to row_num and
+//                row_num + MI_BLOCK_SIZE and runs vp9_loopfilter_job().
+//   - RECON_JOB: reconstructs superblock row job.row_num of tile column
+//                job.tile_col via recon_tile_row().
+//   - PARSE_JOB: parses that row via parse_tile_row(), then queues the
+//                matching RECON_JOB and, if rows remain, the PARSE_JOB for
+//                the next row of the same tile.
+//
+// Returns 1 if this worker never flagged corruption, 0 otherwise.
+static int row_decode_worker_hook(void *arg1, void *arg2) {
+  ThreadData *const thread_data = (ThreadData *)arg1;
+  uint8_t **data_end = (uint8_t **)arg2;
+  VP9Decoder *const pbi = thread_data->pbi;
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+  const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  Job job;
+  LFWorkerData *lf_data = thread_data->lf_data;
+  VP9LfSync *lf_sync = thread_data->lf_sync;
+  // volatile: written after setjmp() and read again after a longjmp().
+  volatile int corrupted = 0;
+
+  while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) {
+    int mi_col;
+    const int mi_row = job.row_num;
+
+    if (job.job_type == LPF_JOB) {
+      lf_data->start = mi_row;
+      lf_data->stop = lf_data->start + MI_BLOCK_SIZE;
+
+      if (cm->lf.filter_level && !cm->skip_loop_filter &&
+          mi_row < cm->mi_rows) {
+        vp9_loopfilter_job(lf_data, lf_sync);
+      }
+    } else if (job.job_type == RECON_JOB) {
+      const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int is_last_row = sb_rows - 1 == cur_sb_row;
+      TileWorkerData twd_recon;
+      TileWorkerData *const tile_data_recon = &twd_recon;
+      int mi_col_start, mi_col_end;
+
+      tile_data_recon->xd = pbi->mb;
+      vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col);
+      vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff);
+      mi_col_start = tile_data_recon->xd.tile.mi_col_start;
+      mi_col_end = tile_data_recon->xd.tile.mi_col_end;
+
+      // Error path (presumably reached by a longjmp from vpx_internal_error
+      // while error_info.setjmp is set): mark every superblock of this row as
+      // done so that workers waiting on it in map_read() are released.
+      if (setjmp(tile_data_recon->error_info.jmp)) {
+        const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
+        tile_data_recon->error_info.setjmp = 0;
+        corrupted = 1;
+        for (mi_col = mi_col_start; mi_col < mi_col_end;
+             mi_col += MI_BLOCK_SIZE) {
+          const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+          map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c,
+                    (cur_sb_row * tile_cols) + job.tile_col);
+        }
+        if (is_last_row) {
+          vp9_tile_done(pbi);
+        }
+        continue;
+      }
+
+      tile_data_recon->error_info.setjmp = 1;
+      tile_data_recon->xd.error_info = &tile_data_recon->error_info;
+
+      recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync,
+                     job.tile_col);
+
+      if (corrupted)
+        vpx_internal_error(&tile_data_recon->error_info,
+                           VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+
+      // The tile counts as finished once its last row has been reconstructed.
+      if (is_last_row) {
+        vp9_tile_done(pbi);
+      }
+    } else if (job.job_type == PARSE_JOB) {
+      TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col];
+
+      // Error path: no further parse/recon jobs are queued for this tile, so
+      // it is counted as done right away.
+      if (setjmp(tile_data->error_info.jmp)) {
+        tile_data->error_info.setjmp = 0;
+        corrupted = 1;
+        vp9_tile_done(pbi);
+        continue;
+      }
+
+      tile_data->xd = pbi->mb;
+      tile_data->xd.counts =
+          cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts;
+
+      tile_data->error_info.setjmp = 1;
+
+      parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end);
+
+      corrupted |= tile_data->xd.corrupted;
+      if (corrupted)
+        vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+
+      /* Queue in the recon_job for this row */
+      {
+        Job recon_job;
+        recon_job.row_num = mi_row;
+        recon_job.tile_col = job.tile_col;
+        recon_job.job_type = RECON_JOB;
+        vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job,
+                       sizeof(recon_job));
+      }
+
+      /* Queue next parse job */
+      if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) {
+        Job parse_job;
+        parse_job.row_num = mi_row + MI_BLOCK_SIZE;
+        parse_job.tile_col = job.tile_col;
+        parse_job.job_type = PARSE_JOB;
+        vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job,
+                       sizeof(parse_job));
+      }
+    }
+  }
+
+  return !corrupted;
+}
+
static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
const uint8_t *data_end) {
VP9_COMMON *const cm = &pbi->common;
@@ -1775,7 +2043,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
row_mt_worker_data->dqcoeff[plane];
}
tile_data->xd.partition = row_mt_worker_data->partition;
- parse_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
+ process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4,
+ PARSE, parse_block);
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane];
@@ -1783,7 +2052,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
row_mt_worker_data->dqcoeff[plane];
}
tile_data->xd.partition = row_mt_worker_data->partition;
- recon_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
+ process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4,
+ RECON, recon_block);
} else {
decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
}
@@ -1951,22 +2221,12 @@ static int compare_tile_buffers(const void *a, const void *b) {
return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size);
}
-static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
- const uint8_t *data_end) {
+static INLINE void init_mt(VP9Decoder *pbi) {
+ int n;
VP9_COMMON *const cm = &pbi->common;
- const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
- const uint8_t *bit_reader_end = NULL;
VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
- YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
- const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
- int n;
-
- assert(tile_cols <= (1 << 6));
- assert(tile_rows == 1);
- (void)tile_rows;
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
if (pbi->num_tile_workers == 0) {
const int num_threads = pbi->max_threads;
@@ -1985,11 +2245,160 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
}
// Initialize LPF
- if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+ if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level &&
+ !cm->skip_loop_filter) {
vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level,
pbi->num_tile_workers);
}
+ // Note: this memset assumes above_context[0], [1] and [2]
+ // are allocated as part of the same buffer.
+ memset(cm->above_context, 0,
+ sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * aligned_mi_cols);
+
+ vp9_reset_lfm(cm);
+}
+
+// Decodes all tiles of a frame with row-based multi-threading.
+//
+// Clears the recon map, runs the shared init_mt() setup, installs
+// row_decode_worker_hook on pbi->max_threads workers, seeds the job queue
+// with one PARSE_JOB (row 0) per tile column, runs the workers and then
+// collects their status.
+//
+// data / data_end delimit the tile data and are handed to get_tile_buffers().
+// Returns the end position reported by vpx_reader_find_end() for the bit
+// reader of the last tile column. Worker failures are recorded in
+// pbi->mb.corrupted.
+static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi,
+ const uint8_t *data,
+ const uint8_t *data_end) {
+ VP9_COMMON *const cm = &pbi->common;
+ RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int num_workers = pbi->max_threads;
+ int i, n;
+ int col;
+ int corrupted = 0;
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+
+ assert(tile_cols <= (1 << 6));
+ assert(tile_rows == 1);
+ (void)tile_rows;
+
+ // No superblock has been reconstructed yet for this frame.
+ memset(row_mt_worker_data->recon_map, 0,
+ sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map));
+
+ init_mt(pbi);
+
+ // Reset tile decoding hook
+ // Note: n is left equal to num_workers after this loop; the sync loop
+ // further down relies on that value.
+ for (n = 0; n < num_workers; ++n) {
+ VPxWorker *const worker = &pbi->tile_workers[n];
+ ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n];
+ winterface->sync(worker);
+
+ // lf_sync / lf_data are only assigned when loop filtering is enabled.
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ thread_data->lf_sync = lf_row_sync;
+ thread_data->lf_data = &thread_data->lf_sync->lfdata[n];
+ vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm,
+ pbi->mb.plane);
+ }
+
+ thread_data->pbi = pbi;
+
+ worker->hook = (VPxWorkerHook)row_decode_worker_hook;
+ worker->data1 = thread_data;
+ // The hook receives the address of the shared data_end pointer.
+ worker->data2 = (void *)&row_mt_worker_data->data_end;
+ }
+
+ // Per-tile worker data starts from a copy of the decoder's macroblockd.
+ for (col = 0; col < tile_cols; ++col) {
+ TileWorkerData *const tile_data = &pbi->tile_worker_data[col];
+ tile_data->xd = pbi->mb;
+ tile_data->xd.counts =
+ cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
+ }
+
+ /* Reset the jobq to start of the jobq buffer */
+ vp9_jobq_reset(&row_mt_worker_data->jobq);
+ row_mt_worker_data->num_tiles_done = 0;
+ // NOTE(review): the workers pass *(&row_mt_worker_data->data_end) to
+ // setup_token_decoder() via parse_tile_row(); it is NULL at that point, not
+ // this function's data_end argument — confirm this is intended.
+ row_mt_worker_data->data_end = NULL;
+
+ // Load tile data into tile_buffers
+ get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
+ &pbi->tile_buffers);
+
+ // Initialize thread frame counts.
+ if (!cm->frame_parallel_decoding_mode) {
+ for (col = 0; col < tile_cols; ++col) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData *)&pbi->tile_worker_data[col];
+ vp9_zero(tile_data->counts);
+ }
+ }
+
+ // queue parse jobs for 0th row of every tile
+ for (col = 0; col < tile_cols; ++col) {
+ Job parse_job;
+ parse_job.row_num = 0;
+ parse_job.tile_col = col;
+ parse_job.job_type = PARSE_JOB;
+ vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job));
+ }
+
+ // The last worker is started with execute(), all others with launch()
+ // (presumably execute() runs the hook on the calling thread — confirm
+ // against the worker interface).
+ for (i = 0; i < num_workers; ++i) {
+ VPxWorker *const worker = &pbi->tile_workers[i];
+ worker->had_error = 0;
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // n still equals num_workers here (set by the hook-reset loop above).
+ for (; n > 0; --n) {
+ VPxWorker *const worker = &pbi->tile_workers[n - 1];
+ // TODO(jzern): The tile may have specific error data associated with
+ // its vpx_internal_error_info which could be propagated to the main info
+ // in cm. Additionally once the threads have been synced and an error is
+ // detected, there's no point in continuing to decode tiles.
+ corrupted |= !winterface->sync(worker);
+ }
+
+ pbi->mb.corrupted = corrupted;
+
+ {
+ /* Set data end */
+ TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1];
+ row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader);
+ }
+
+ // Accumulate thread frame counts.
+ if (!cm->frame_parallel_decoding_mode) {
+ for (i = 0; i < tile_cols; ++i) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData *)&pbi->tile_worker_data[i];
+ vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1);
+ }
+ }
+
+ return row_mt_worker_data->data_end;
+}
+
+static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end) {
+ VP9_COMMON *const cm = &pbi->common;
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ const uint8_t *bit_reader_end = NULL;
+ VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
+ int n;
+
+ assert(tile_cols <= (1 << 6));
+ assert(tile_rows == 1);
+ (void)tile_rows;
+
+ init_mt(pbi);
+
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
VPxWorker *const worker = &pbi->tile_workers[n];
@@ -2012,15 +2421,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
worker->data2 = pbi;
}
- // Note: this memset assumes above_context[0], [1] and [2]
- // are allocated as part of the same buffer.
- memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
- memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_mi_cols);
-
- vp9_reset_lfm(cm);
-
// Load tile data into tile_buffers
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
&pbi->tile_buffers);
@@ -2366,25 +2766,30 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
setup_tile_info(cm, rb);
if (pbi->row_mt == 1) {
int num_sbs = 1;
+ const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+ const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
if (pbi->row_mt_worker_data == NULL) {
CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
+#if CONFIG_MULTITHREAD
+ pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL);
+#endif
}
if (pbi->max_threads > 1) {
const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
- const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
- const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
num_sbs = sb_cols * sb_rows;
}
if (num_sbs > pbi->row_mt_worker_data->num_sbs) {
vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
- vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs);
+ vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs,
+ pbi->max_threads, sb_rows << cm->log2_tile_cols);
}
+ vp9_jobq_alloc(pbi);
}
sz = vpx_rb_read_literal(rb, 16);
@@ -2544,21 +2949,27 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
pbi->total_tiles = tile_rows * tile_cols;
}
- if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
- // Multi-threaded tile decoder
- *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
- if (!pbi->lpf_mt_opt) {
- if (!xd->corrupted) {
- if (!cm->skip_loop_filter) {
- // If multiple threads are used to decode tiles, then we use those
- // threads to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane,
- cm->lf.filter_level, 0, 0, pbi->tile_workers,
- pbi->num_tile_workers, &pbi->lf_row_sync);
+ if (pbi->max_threads > 1 && tile_rows == 1 &&
+ (tile_cols > 1 || pbi->row_mt == 1)) {
+ if (pbi->row_mt == 1) {
+ *p_data_end =
+ decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end);
+ } else {
+ // Multi-threaded tile decoder
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+ if (!pbi->lpf_mt_opt) {
+ if (!xd->corrupted) {
+ if (!cm->skip_loop_filter) {
+ // If multiple threads are used to decode tiles, then we use those
+ // threads to do parallel loopfiltering.
+ vp9_loop_filter_frame_mt(
+ new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0,
+ pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync);
+ }
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
}
- } else {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Decode failed. Frame data is corrupted.");
}
}
} else {
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 7fde0b07f..0aed3d717 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -56,10 +56,34 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) {
}
void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
- VP9_COMMON *cm, int num_sbs) {
+ VP9_COMMON *cm, int num_sbs, int max_threads,
+ int num_jobs) {
int plane;
const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) *
sizeof(*row_mt_worker_data->dqcoeff[0]);
+ row_mt_worker_data->num_jobs = num_jobs;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+ CHECK_MEM_ERROR(
+ cm, row_mt_worker_data->recon_sync_mutex,
+ vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs));
+ if (row_mt_worker_data->recon_sync_mutex) {
+ for (i = 0; i < num_jobs; ++i) {
+ pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(
+ cm, row_mt_worker_data->recon_sync_cond,
+ vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs));
+ if (row_mt_worker_data->recon_sync_cond) {
+ for (i = 0; i < num_jobs; ++i) {
+ pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL);
+ }
+ }
+ }
+#endif
row_mt_worker_data->num_sbs = num_sbs;
for (plane = 0; plane < 3; ++plane) {
CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane],
@@ -74,11 +98,36 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
sizeof(*row_mt_worker_data->partition)));
CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map,
vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
+
+ // allocate memory for thread_data
+ if (row_mt_worker_data->thread_data == NULL) {
+ const size_t thread_size =
+ max_threads * sizeof(*row_mt_worker_data->thread_data);
+ CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data,
+ vpx_memalign(32, thread_size));
+ }
}
void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) {
if (row_mt_worker_data != NULL) {
int plane;
+#if CONFIG_MULTITHREAD
+ int i;
+ if (row_mt_worker_data->recon_sync_mutex != NULL) {
+ for (i = 0; i < row_mt_worker_data->num_jobs; ++i) {
+ pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]);
+ }
+ vpx_free(row_mt_worker_data->recon_sync_mutex);
+ row_mt_worker_data->recon_sync_mutex = NULL;
+ }
+ if (row_mt_worker_data->recon_sync_cond != NULL) {
+ for (i = 0; i < row_mt_worker_data->num_jobs; ++i) {
+ pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]);
+ }
+ vpx_free(row_mt_worker_data->recon_sync_cond);
+ row_mt_worker_data->recon_sync_cond = NULL;
+ }
+#endif
for (plane = 0; plane < 3; ++plane) {
vpx_free(row_mt_worker_data->eob[plane]);
row_mt_worker_data->eob[plane] = NULL;
@@ -89,6 +138,8 @@ void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) {
row_mt_worker_data->partition = NULL;
vpx_free(row_mt_worker_data->recon_map);
row_mt_worker_data->recon_map = NULL;
+ vpx_free(row_mt_worker_data->thread_data);
+ row_mt_worker_data->thread_data = NULL;
}
}
@@ -179,8 +230,16 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
if (pbi->row_mt == 1) {
vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
+ if (pbi->row_mt_worker_data != NULL) {
+ vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq);
+ vpx_free(pbi->row_mt_worker_data->jobq_buf);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex);
+#endif
+ }
vpx_free(pbi->row_mt_worker_data);
}
+
vp9_remove_common(&pbi->common);
vpx_free(pbi);
}
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 9a582fffb..4a22aa6b5 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -21,6 +21,7 @@
#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
+#include "./vp9_job_queue.h"
#ifdef __cplusplus
extern "C" {
@@ -30,6 +31,14 @@ extern "C" {
#define DQCOEFFS_PER_SB_LOG2 12
#define PARTITIONS_PER_SB 85
+// Kind of work carried by a Job entry (stored in Job::job_type).
+// NOTE(review): the names suggest bitstream parsing, reconstruction and loop
+// filtering respectively -- confirm against the users in vp9_decodeframe.c.
+typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType;
+
+// Per-thread context: the owning decoder plus loop-filter data and sync
+// objects. vp9_dec_alloc_row_mt_mem() allocates an array of max_threads of
+// these into RowMTWorkerData::thread_data.
+typedef struct ThreadData {
+ struct VP9Decoder *pbi;
+ LFWorkerData *lf_data;
+ VP9LfSync *lf_sync;
+} ThreadData;
+
typedef struct TileBuffer {
const uint8_t *data;
size_t size;
@@ -49,14 +58,38 @@ typedef struct TileWorkerData {
struct vpx_internal_error_info error_info;
} TileWorkerData;
+// Signature of a per-block callback. It receives the tile worker data, the
+// decoder, the (mi_row, mi_col) coordinates and the block size.
+// NOTE(review): bwl/bhl are presumably log2 of the block width/height --
+// confirm against the callers in vp9_decodeframe.c.
+typedef void (*process_block_fn_t)(TileWorkerData *twd,
+ struct VP9Decoder *const pbi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int bwl,
+ int bhl);
+
+// State shared by the row-based multi-threaded decoder. The sync arrays and
+// thread_data are set up by vp9_dec_alloc_row_mt_mem() and released by
+// vp9_dec_free_row_mt_mem(); jobq, jobq_buf and recon_done_mutex are torn
+// down in vp9_decoder_remove().
typedef struct RowMTWorkerData {
int num_sbs;
int *eob[MAX_MB_PLANE];
PARTITION_TYPE *partition;
tran_low_t *dqcoeff[MAX_MB_PLANE];
int8_t *recon_map;
+ const uint8_t *data_end;
+ // Freed with vpx_free() in vp9_decoder_remove().
+ // NOTE(review): presumably the storage handed to vp9_jobq_init() for jobq --
+ // confirm in vp9_jobq_alloc().
+ uint8_t *jobq_buf;
+ JobQueueRowMt jobq;
+ size_t jobq_size;
+ int num_tiles_done;
+ // Number of entries in recon_sync_mutex[] and recon_sync_cond[].
+ int num_jobs;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t recon_done_mutex;
+ pthread_mutex_t *recon_sync_mutex;
+ pthread_cond_t *recon_sync_cond;
+#endif
+ // Array of max_threads entries, allocated with 32-byte alignment.
+ ThreadData *thread_data;
} RowMTWorkerData;
+/* Structure to queue and dequeue row decode jobs.
+ * NOTE(review): row_num/tile_col presumably identify a row within a tile
+ * column -- confirm the row unit against vp9_decodeframe.c. */
+typedef struct Job {
+ int row_num;
+ int tile_col;
+ JobType job_type; /* Which kind of work this entry requests. */
+} Job;
+
typedef struct VP9Decoder {
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -128,7 +161,8 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
void vp9_decoder_remove(struct VP9Decoder *pbi);
void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
- VP9_COMMON *cm, int num_sbs);
+ VP9_COMMON *cm, int num_sbs, int max_threads,
+ int num_jobs);
void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data);
static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
diff --git a/vp9/decoder/vp9_job_queue.c b/vp9/decoder/vp9_job_queue.c
new file mode 100644
index 000000000..9a31f5a6d
--- /dev/null
+++ b/vp9/decoder/vp9_job_queue.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "vpx/vpx_integer.h"
+
+#include "vp9/decoder/vp9_job_queue.h"
+
+/* Sets up an empty job queue on top of a caller-provided buffer.
+ *
+ * jobq:     queue to initialize.
+ * buf:      storage for the queued jobs; only the pointer is stored, the
+ *           queue functions in this file never free it.
+ * buf_size: size of buf in bytes.
+ *
+ * Under CONFIG_MULTITHREAD this also creates the queue's mutex and condition
+ * variable, which vp9_jobq_deinit() destroys.
+ */
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_init(&jobq->mutex, NULL);
+ pthread_cond_init(&jobq->cond, NULL);
+#endif
+ jobq->buf_base = buf;
+ jobq->buf_wr = buf;
+ jobq->buf_rd = buf;
+ jobq->buf_end = buf + buf_size;
+ jobq->terminate = 0;
+}
+
+/* Empties the queue: rewinds the read and write pointers to the start of the
+ * buffer and clears the terminate flag. The update is done with the queue
+ * mutex held when CONFIG_MULTITHREAD is enabled. The buffer contents are not
+ * modified.
+ */
+void vp9_jobq_reset(JobQueueRowMt *jobq) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&jobq->mutex);
+#endif
+ jobq->buf_wr = jobq->buf_base;
+ jobq->buf_rd = jobq->buf_base;
+ jobq->terminate = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&jobq->mutex);
+#endif
+}
+
+/* Resets the queue and, under CONFIG_MULTITHREAD, destroys the mutex and
+ * condition variable created by vp9_jobq_init(). The job buffer itself is
+ * not freed here.
+ */
+void vp9_jobq_deinit(JobQueueRowMt *jobq) {
+ /* vp9_jobq_reset() takes the mutex, so it must run before the destroy. */
+ vp9_jobq_reset(jobq);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&jobq->mutex);
+ pthread_cond_destroy(&jobq->cond);
+#endif
+}
+
+/* Marks the queue as terminated. After this, vp9_jobq_dequeue() returns 1
+ * instead of waiting once no job is left to read. Under CONFIG_MULTITHREAD
+ * every thread waiting on the queue's condition variable is woken so it can
+ * observe the flag.
+ */
+void vp9_jobq_terminate(JobQueueRowMt *jobq) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&jobq->mutex);
+#endif
+ jobq->terminate = 1;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(&jobq->cond);
+ pthread_mutex_unlock(&jobq->mutex);
+#endif
+}
+
+/* Appends one job to the queue by copying job_size bytes from job.
+ *
+ * jobq:     queue to append to.
+ * job:      job record to copy; it is only read.
+ * job_size: size of the record in bytes.
+ *
+ * Returns 0 on success. Returns 1 (and asserts in debug builds) when fewer
+ * than job_size bytes remain in the buffer: the queue is linear and does not
+ * wrap around. Under CONFIG_MULTITHREAD one waiter is signalled on success.
+ */
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) {
+  int ret = 1;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  /* Compare byte counts instead of computing buf_wr + job_size: that sum may
+   * point beyond one-past-the-end of the buffer, and merely forming such a
+   * pointer is undefined behavior. buf_wr never exceeds buf_end, so the
+   * difference is non-negative. */
+  if ((size_t)(jobq->buf_end - jobq->buf_wr) >= job_size) {
+    memcpy(jobq->buf_wr, job, job_size);
+    jobq->buf_wr = jobq->buf_wr + job_size;
+#if CONFIG_MULTITHREAD
+    pthread_cond_signal(&jobq->cond);
+#endif
+    ret = 0;
+  } else {
+    /* Wrap around case is not supported */
+    assert(0);
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+  return ret;
+}
+
+/* Removes the oldest job from the queue by copying job_size bytes into job.
+ *
+ * jobq:     queue to read from.
+ * job:      destination for the job record.
+ * job_size: size of the record in bytes.
+ * blocking: 1 to wait until a job is queued or the queue is terminated;
+ *           any other value to return immediately when no job is available.
+ *
+ * Returns 0 when a job was copied into job. Returns 1 when no job was
+ * obtained: the queue is empty and terminated, the call is non-blocking and
+ * the queue is empty, or the read pointer is within job_size bytes of the end
+ * of the buffer (wrap around is not supported).
+ *
+ * Without CONFIG_MULTITHREAD there is no other thread that could add a job
+ * while this call waits, so a blocking call on an empty queue returns 1
+ * instead of spinning forever.
+ */
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+                     int blocking) {
+  int ret = 1;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  /* Byte counts are compared instead of computing buf_rd + job_size, which
+   * could form a pointer beyond one-past-the-end of the buffer (undefined
+   * behavior). buf_rd <= buf_wr <= buf_end always holds, so both differences
+   * below are non-negative. */
+  if ((size_t)(jobq->buf_end - jobq->buf_rd) >= job_size) {
+    while (1) {
+      if ((size_t)(jobq->buf_wr - jobq->buf_rd) >= job_size) {
+        memcpy(job, jobq->buf_rd, job_size);
+        jobq->buf_rd = jobq->buf_rd + job_size;
+        ret = 0;
+        break;
+      }
+      /* If all the entries have been dequeued, then break and return */
+      if (jobq->terminate == 1) break;
+      /* If there is no job available,
+       * and this is non blocking call then return fail */
+      if (blocking != 1) break;
+#if CONFIG_MULTITHREAD
+      /* Re-check the queue after every wake-up (handles spurious wake-ups
+       * and jobs taken by another consumer). */
+      pthread_cond_wait(&jobq->cond, &jobq->mutex);
+#else
+      /* Single-threaded build: nothing can queue a job while we wait. */
+      break;
+#endif
+    }
+  }
+  /* else: wrap around case is not supported, ret stays 1 */
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+
+  return ret;
+}
diff --git a/vp9/decoder/vp9_job_queue.h b/vp9/decoder/vp9_job_queue.h
new file mode 100644
index 000000000..bc23bf9c2
--- /dev/null
+++ b/vp9/decoder/vp9_job_queue.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+
+#include "vpx_util/vpx_thread.h"
+
+// Linear (non-wrapping) queue of job records stored in a caller-provided byte
+// buffer. Records are appended at buf_wr and consumed from buf_rd; see
+// vp9_jobq_queue() and vp9_jobq_dequeue().
+typedef struct {
+ // Pointer to buffer base which contains the jobs
+ uint8_t *buf_base;
+
+ // Pointer to current address where new job can be added
+ uint8_t *volatile buf_wr;
+
+ // Pointer to current address from where next job can be obtained
+ uint8_t *volatile buf_rd;
+
+ // Pointer to end of job buffer
+ uint8_t *buf_end;
+
+ // Set to 1 by vp9_jobq_terminate() and cleared by vp9_jobq_init() and
+ // vp9_jobq_reset(). When set, vp9_jobq_dequeue() fails instead of waiting
+ // once no job is left.
+ int terminate;
+
+#if CONFIG_MULTITHREAD
+ // Held by the queue functions while they access buf_wr, buf_rd and
+ // terminate.
+ pthread_mutex_t mutex;
+ // Signalled when a job is queued; broadcast on terminate.
+ pthread_cond_t cond;
+#endif
+} JobQueueRowMt;
+
+// Initializes an empty queue over buf (buf_size bytes).
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size);
+// Rewinds the read/write pointers and clears the terminate flag.
+void vp9_jobq_reset(JobQueueRowMt *jobq);
+// Resets the queue and destroys its synchronization objects.
+void vp9_jobq_deinit(JobQueueRowMt *jobq);
+// Sets the terminate flag and wakes all waiting consumers.
+void vp9_jobq_terminate(JobQueueRowMt *jobq);
+// Copies job_size bytes from job into the queue. Returns 0 on success, 1 if
+// the buffer has no room left (wrap around is not supported).
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size);
+// Copies the next job_size bytes from the queue into job. Returns 0 on
+// success, 1 if no job was obtained. With blocking == 1 the call waits for a
+// job until the queue is terminated.
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+ int blocking);
+
+#endif // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index d2cdb1010..4e301cc17 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -91,6 +91,9 @@ typedef struct PC_TREE {
struct PC_TREE *split[4];
PICK_MODE_CONTEXT *leaf_split[4];
};
+ // Obtained from a simple motion search. Used by the ML based partition search
+ // speed feature.
+ MV mv;
} PC_TREE;
void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 2820b71b4..65ce15ff7 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -201,7 +201,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
int i;
struct buf_2d saved_dst[MAX_MB_PLANE];
struct buf_2d saved_pre[MAX_MB_PLANE];
- RefBuffer *saved_block_refs[2];
+ const RefBuffer *saved_block_refs[2];
MV_REFERENCE_FRAME saved_frame;
frame = ctx->best_reference_frame;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5adefac1a..236567f94 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3440,18 +3440,59 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
#undef FEATURES
#undef LABELS
+// Perform fast and coarse motion search for the given block. This is a
+// pre-processing step for the ML based partition search speedup.
+//
+// Runs one NSTEP full-pixel search against reference frame 'ref', starting
+// from 'ref_mv', then builds the luma inter prediction for the block into
+// 'pred_buf' using a stride of 64.
+//
+// Side effects: overwrites ref_frame[], sb_type and mv[0] of xd->mi[0] (the
+// resulting motion vector is left in mv[0]) and redirects xd->plane[0].dst to
+// 'pred_buf'; neither is restored. x->mv_limits is restored before returning.
+// Returns without doing anything if the reference buffer is unavailable.
+static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ MV ref_mv, MV_REFERENCE_FRAME ref,
+ uint8_t *const pred_buf) {
+ const VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref);
+ const int step_param = 1;
+ // Saved so the search range narrowed below can be undone.
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ const SEARCH_METHODS search_method = NSTEP;
+ const int sadpb = x->sadperbit16;
+ // Search start point: ref_mv scaled down by 8 (the result is scaled back up
+ // by 8 after the search). NOTE(review): presumably 1/8-pel to full-pel.
+ MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 };
+ MV best_mv = { 0, 0 };
+ int cost_list[5];
+
+ assert(yv12 != NULL);
+ // Keeps builds with asserts disabled from dereferencing a missing buffer.
+ if (!yv12) return;
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[ref - 1].sf);
+ mi->ref_frame[0] = ref;
+ mi->ref_frame[1] = NONE;
+ mi->sb_type = bsize;
+ vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+ vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method,
+ sadpb, cond_cost_list(cpi, cost_list), &ref_mv,
+ &best_mv, 0, 0);
+ // Scale the search result back up to the units of ref_mv.
+ best_mv.row *= 8;
+ best_mv.col *= 8;
+ x->mv_limits = tmp_mv_limits;
+ mi->mv[0].as_mv = best_mv;
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ xd->plane[0].dst.buf = pred_buf;
+ xd->plane[0].dst.stride = 64;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+}
+
// Use a neural net model to prune partition-none and partition-split search.
// The model uses prediction residue variance and quantization step size as
// input features.
#define FEATURES 6
-static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
+static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
+ MACROBLOCK *const x,
+ PC_TREE *const pc_tree,
BLOCK_SIZE bsize, int mi_row,
int mi_col, int *none, int *split) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = xd->mi[0];
+ const VP9_COMMON *const cm = &cpi->common;
const NN_CONFIG *nn_config = NULL;
#if CONFIG_VP9_HIGHBITDEPTH
+ MACROBLOCKD *xd = &x->e_mbd;
DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]);
uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
? (CONVERT_TO_BYTEPTR(pred_buffer))
@@ -3489,41 +3530,20 @@ static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
if (!nn_config) return;
- mi->ref_frame[1] = NONE;
- mi->sb_type = bsize;
// Do a simple single motion search to find a prediction for current block.
// The variance of the residue will be used as input features.
{
+ MV ref_mv;
const MV_REFERENCE_FRAME ref =
cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
- YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref);
- MV ref_mv = { 0, 0 };
- MV ref_mv_full = { 0, 0 };
- const int step_param = 1;
- const MvLimits tmp_mv_limits = x->mv_limits;
- const SEARCH_METHODS search_method = NSTEP;
- const int sadpb = x->sadperbit16;
- MV best_mv = { 0, 0 };
- int cost_list[5];
-
- assert(yv12 != NULL);
- if (!yv12) return;
- vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
- &cm->frame_refs[ref - 1].sf);
- mi->ref_frame[0] = ref;
- vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
- vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param,
- search_method, sadpb, cond_cost_list(cpi, cost_list),
- &ref_mv, &best_mv, 0, 0);
- best_mv.row *= 8;
- best_mv.col *= 8;
- x->mv_limits = tmp_mv_limits;
- mi->mv[0].as_mv = best_mv;
-
- set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
- xd->plane[0].dst.buf = pred_buf;
- xd->plane[0].dst.stride = 64;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ // If bsize is 64x64, use zero MV as reference; otherwise, use MV result
+ // of previous(larger) block as reference.
+ if (bsize == BLOCK_64X64)
+ ref_mv.row = ref_mv.col = 0;
+ else
+ ref_mv = pc_tree->mv;
+ simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf);
+ pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv;
}
vpx_clear_system_state();
@@ -3818,14 +3838,19 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
pc_tree->partitioning = PARTITION_NONE;
- if (cpi->sf.ml_var_partition_pruning) {
+ if (cpi->sf.ml_var_partition_pruning && !frame_is_intra_only(cm)) {
const int do_ml_var_partition_pruning =
- !frame_is_intra_only(cm) && partition_none_allowed && do_split &&
+ partition_none_allowed && do_split &&
mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
if (do_ml_var_partition_pruning) {
- ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col,
+ ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
&partition_none_allowed, &do_split);
+ } else {
+ vp9_zero(pc_tree->mv);
+ }
+ if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks.
+ for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv;
}
}
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index a690ebc73..1e1c6b715 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -947,8 +947,8 @@ static INLINE RefCntBuffer *get_ref_cnt_buffer(VP9_COMMON *cm, int fb_idx) {
}
static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
- VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
- VP9_COMMON *const cm = &cpi->common;
+ const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
: NULL;
@@ -1027,7 +1027,7 @@ static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
cpi->oxcf.enable_auto_arf;
}
-static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
+static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd,
MV_REFERENCE_FRAME ref0,
MV_REFERENCE_FRAME ref1) {
xd->block_refs[0] =
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 8f0da48a2..5cfffe6b5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -549,7 +549,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
}
#define FP_DN_THRESH 8
-#define FP_MAX_DN_THRESH 16
+#define FP_MAX_DN_THRESH 24
#define KERNEL_SIZE 3
// Baseline Kernal weights for first pass noise metric
@@ -843,6 +843,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
double mb_intra_factor;
double mb_brightness_factor;
double mb_neutral_count;
+ int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH);
// First pass code requires valid last and new frame buffers.
assert(new_yv12 != NULL);
@@ -1254,7 +1255,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
}
}
#endif
-
// Does the row vector point inwards or outwards?
if (mb_row < cm->mb_rows / 2) {
if (mv.row > 0)
@@ -1280,14 +1280,13 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
else if (mv.col < 0)
--(fp_acc_data->sum_in_vectors);
}
- fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
- } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+ }
+ if (this_intra_error < scaled_low_intra_thresh) {
fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
- } else { // 0,0 mv but high error
+ } else {
fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
}
} else { // Intra < inter error
- int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH);
if (this_intra_error < scaled_low_intra_thresh) {
fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
if (this_motion_error < scaled_low_intra_thresh) {
@@ -2399,8 +2398,12 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,
twopass->arnr_strength_adjustment = 0;
- if ((section_zeromv < 0.10) || (section_noise <= (SECTION_NOISE_DEF * 0.75)))
+ if (section_noise < 150) {
twopass->arnr_strength_adjustment -= 1;
+ if (section_noise < 75) twopass->arnr_strength_adjustment -= 1;
+ } else if (section_noise > 250)
+ twopass->arnr_strength_adjustment += 1;
+
if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1;
}
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 602cc5798..63f7f9957 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2487,7 +2487,8 @@ double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
point as the best match, we will do a final 1-away diamond
refining search */
-static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
+static int full_pixel_diamond(const VP9_COMP *const cpi,
+ const MACROBLOCK *const x, MV *mvp_full,
int step_param, int sadpb, int further_steps,
int do_refine, int *cost_list,
const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2549,8 +2550,9 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
// Runs an limited range exhaustive mesh search using a pattern set
// according to the encode speed profile.
-static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x,
- MV *centre_mv_full, int sadpb, int *cost_list,
+static int full_pixel_exhaustive(const VP9_COMP *const cpi,
+ const MACROBLOCK *const x, MV *centre_mv_full,
+ int sadpb, int *cost_list,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv) {
const SPEED_FEATURES *const sf = &cpi->sf;
@@ -2812,13 +2814,13 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
return best_sad;
}
-int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- MV *mvp_full, int step_param, int search_method,
- int error_per_bit, int *cost_list, const MV *ref_mv,
- MV *tmp_mv, int var_max, int rd) {
+int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int search_method, int error_per_bit, int *cost_list,
+ const MV *ref_mv, MV *tmp_mv, int var_max, int rd) {
const SPEED_FEATURES *const sf = &cpi->sf;
const SEARCH_METHODS method = (SEARCH_METHODS)search_method;
- vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
int var = 0;
int run_exhaustive_search = 0;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 779e8d8e7..da93c5d44 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -115,7 +115,8 @@ struct VP9_COMP;
// "mvp_full" is the MV search starting point;
// "ref_mv" is the context reference MV;
// "tmp_mv" is the searched best MV.
-int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+int vp9_full_pixel_search(const struct VP9_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
MV *mvp_full, int step_param, int search_method,
int error_per_bit, int *cost_list, const MV *ref_mv,
MV *tmp_mv, int var_max, int rd);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index a3240513f..8cd1e6e31 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1683,6 +1683,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
unsigned int sse_zeromv_normalized = UINT_MAX;
unsigned int best_sse_sofar = UINT_MAX;
int gf_temporal_ref = 0;
+ int force_test_gf_zeromv = 0;
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
int64_t zero_last_cost_orig = INT64_MAX;
@@ -1939,6 +1940,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
flag_svc_subpel = 1;
}
+ // For SVC with quality layers, when QP of lower layer is lower
+ // than current layer: force check of GF-ZEROMV before early exit
+ // due to skip flag.
+ if (svc->spatial_layer_id > 0 && no_scaling &&
+ (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+ cm->base_qindex > svc->lower_layer_qindex + 10)
+ force_test_gf_zeromv = 1;
+
for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
int rate_mv = 0;
int mode_rd_thresh;
@@ -2349,11 +2358,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
}
- if (x->skip) break;
+ if (x->skip &&
+ (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME]))
+ break;
// If early termination flag is 1 and at least 2 modes are checked,
// the mode search is terminated.
- if (best_early_term && idx > 0 && !scene_change_detected) {
+ if (best_early_term && idx > 0 && !scene_change_detected &&
+ (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) {
x->skip = 1;
break;
}
@@ -2396,6 +2408,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
if (best_rdc.rdcost == INT64_MAX ||
+ (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0 &&
+ !x->zero_temp_sad_source) ||
(scene_change_detected && perform_intra_pred) ||
((!force_skip_low_temp_var || bsize < BLOCK_32X32 ||
x->content_state_sb == kVeryHighSad) &&
@@ -2438,8 +2452,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
const PREDICTION_MODE this_mode = intra_mode_list[i];
THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
int mode_rd_thresh = rd_threshes[mode_index];
+ // For spatially flat blocks, under short_circuit_flat_blocks flag:
+ // only check DC mode for stationary blocks, otherwise also check
+ // H and V mode.
if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
- this_mode != DC_PRED) {
+ ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) {
continue;
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index c1a079ff0..c73b0ed87 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3034,7 +3034,7 @@ static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x,
if (content_type == VP9E_CONTENT_FILM) {
if (src_rec_min <= VERY_LOW_VAR_THRESH) {
if (ref_frame == INTRA_FRAME) *this_rd *= 2;
- if (bsize > 6) *this_rd *= 2;
+ if (bsize > BLOCK_16X16) *this_rd *= 2;
}
}
}
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 7d2701355..ee5f0e56c 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -34,6 +34,9 @@
#include "vpx_scale/vpx_scale.h"
static int fixed_divide[512];
+static unsigned int index_mult[14] = {
+ 0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
+};
static void temporal_filter_predictors_mb_c(
MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
@@ -184,7 +187,13 @@ void vp9_temporal_filter_init(void) {
static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
int filter_weight) {
- int mod = (sum_dist * 3) / index;
+ int mod;
+
+ assert(index >= 0 && index <= 13);
+ assert(index_mult[index] != 0);
+
+ mod =
+ ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
mod += rounding;
mod >>= strength;
@@ -672,7 +681,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK);
#endif // CONFIG_VP9_HIGHBITDEPTH
- if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2);
+ if (src_variance <= 2) {
+ strength = VPXMAX(0, arnr_filter_data->strength - 2);
+ }
}
for (frame = 0; frame < frame_count; frame++) {
@@ -768,7 +779,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
count + (BLK_PELS << 1));
} else {
// Apply the filter (YUV)
- vp9_apply_temporal_filter_c(
+ vp9_apply_temporal_filter(
f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
@@ -779,7 +790,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
}
#else
// Apply the filter (YUV)
- vp9_apply_temporal_filter_c(
+ vp9_apply_temporal_filter(
f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h
index f5fa194d1..553a46828 100644
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -24,7 +24,7 @@ static const MV kZeroMv = { 0, 0 };
#define BH_LOG2 5
#define BW 32
#define BW_LOG2 5
-#define BLK_PELS 1024 // Pixels in the block
+#define BLK_PELS ((BH) * (BW)) // Pixels in the block
#define TF_SHIFT 2
#define TF_ROUND 3
#define THR_SHIFT 2
diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/x86/temporal_filter_constants.h
new file mode 100644
index 000000000..20b7085a3
--- /dev/null
+++ b/vp9/encoder/x86/temporal_filter_constants.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can create a C to replicate the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+ TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+ TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
+};
+
+#define DIST_STRIDE ((BW) + 2)
diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c
index e5860d39c..b560e2218 100644
--- a/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/vp9/encoder/x86/temporal_filter_sse4.c
@@ -14,32 +14,9 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-
-// Division using multiplication and shifting. The C implementation does:
-// modifier *= 3;
-// modifier /= index;
-// where 'modifier' is a set of summed values and 'index' is the number of
-// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values
-// which may be bound by the edges of the block being filtered.
-//
-// This equation works out to (m * 3) / i which reduces to:
-// m * 3/4
-// m * 1/2
-// m * 1/3
-//
-// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
-// m * C / 65536
-// we can create a C to replicate the division.
-//
-// m * 49152 / 65536 = m * 3/4
-// m * 32758 / 65536 = m * 1/2
-// m * 21846 / 65536 = m * 0.3333
-//
-// These are loaded using an instruction expecting int16_t values but are used
-// with _mm_mulhi_epu16(), which treats them as unsigned.
-#define NEIGHBOR_CONSTANT_4 (int16_t)49152
-#define NEIGHBOR_CONSTANT_6 (int16_t)32768
-#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/x86/temporal_filter_constants.h"
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
@@ -106,12 +83,62 @@ static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
*sum_1 = sum_u16;
}
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+ const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+
+ __m128i dist_first;
+
+ dist_first = _mm_sub_epi16(a_first, b_first);
+ dist_first = _mm_mullo_epi16(dist_first, dist_first);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+ const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
+ const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+ const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
+
+ __m128i dist_first, dist_second;
+
+ dist_first = _mm_sub_epi16(a_first, b_first);
+ dist_second = _mm_sub_epi16(a_second, b_second);
+ dist_first = _mm_mullo_epi16(dist_first, dist_first);
+ dist_second = _mm_mullo_epi16(dist_second, dist_second);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+ _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
+ *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
+ __m128i *reg_second) {
+ read_dist_8(dist, reg_first);
+ read_dist_8(dist + 8, reg_second);
+}
+
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
// by weight.
-static __m128i average_8(__m128i sum, const __m128i mul_constants,
+static __m128i average_8(__m128i sum, const __m128i *mul_constants,
const int strength, const int rounding,
const int weight) {
// _mm_srl_epi16 uses the lower 64 bit value for the shift.
@@ -121,7 +148,34 @@ static __m128i average_8(__m128i sum, const __m128i mul_constants,
const __m128i sixteen = _mm_set1_epi16(16);
// modifier * 3 / index;
- sum = _mm_mulhi_epu16(sum, mul_constants);
+ sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+ // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+ // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
+ // So this needs to use the epu16 version which did not come until SSE4.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
+static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
+ const int strength, const int rounding,
+ const int weight_0, const int weight_1) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 =
+ _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
+ weight_1, weight_1);
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
+ sum = _mm_mulhi_epu16(sum, *mul_constants);
sum = _mm_adds_epu16(sum, rounding_u16);
sum = _mm_srl_epi16(sum, strength_u128);
@@ -136,20 +190,21 @@ static __m128i average_8(__m128i sum, const __m128i mul_constants,
return _mm_mullo_epi16(sum, weight_u16);
}
-static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
- const __m128i mul_constants_0,
- const __m128i mul_constants_1, const int strength,
- const int rounding, const int weight) {
+static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+ const __m128i *mul_constants_0,
+ const __m128i *mul_constants_1,
+ const int strength, const int rounding,
+ const int weight) {
const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
const __m128i rounding_u16 = _mm_set1_epi16(rounding);
const __m128i weight_u16 = _mm_set1_epi16(weight);
const __m128i sixteen = _mm_set1_epi16(16);
__m128i input_0, input_1;
- input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
+ input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
input_0 = _mm_adds_epu16(input_0, rounding_u16);
- input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
+ input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
input_1 = _mm_adds_epu16(input_1, rounding_u16);
input_0 = _mm_srl_epi16(input_0, strength_u128);
@@ -192,10 +247,10 @@ static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
_mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
-static void accumulate_and_store_16(const __m128i sum_0_u16,
- const __m128i sum_1_u16,
- const uint8_t *pred, uint16_t *count,
- uint32_t *accumulator) {
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+ const __m128i sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
const __m128i zero = _mm_setzero_si128();
__m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
@@ -235,6 +290,28 @@ static void accumulate_and_store_16(const __m128i sum_0_u16,
_mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
}
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+ __m128i dist_reg, dist_left, dist_right;
+
+ dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+ dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
+ dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
+
+ *sum = _mm_adds_epu16(dist_reg, dist_left);
+ *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+ __m128i *sum_second) {
+ get_sum_8(y_dist, sum_first);
+ get_sum_8(y_dist + 8, sum_second);
+}
+
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
const uint8_t *b, unsigned int width,
unsigned int height, int strength,
@@ -261,7 +338,8 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
sum_8(a, b, &sum_row_a);
sum_8(a + stride, b + width, &sum_row_b);
sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
- sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
+ sum_row_c =
+ average_8(sum_row_c, &mul_constants, strength, rounding, weight);
accumulate_and_store_8(sum_row_c, b, count, accumulator);
a += stride + stride;
@@ -279,7 +357,7 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
sum_row_a =
- average_8(sum_row_a, mul_constants, strength, rounding, weight);
+ average_8(sum_row_a, &mul_constants, strength, rounding, weight);
accumulate_and_store_8(sum_row_a, b, count, accumulator);
a += stride;
@@ -296,7 +374,8 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
- sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
+ sum_row_a =
+ average_8(sum_row_a, &mul_constants, strength, rounding, weight);
accumulate_and_store_8(sum_row_a, b, count, accumulator);
} else { // width == 16
@@ -318,7 +397,7 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
- average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
+ average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,
strength, rounding, weight);
accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
@@ -343,7 +422,7 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);
- average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
+ average_16(&sum_row_a_0, &sum_row_a_1, &mul_constants_0, &mul_constants_1,
strength, rounding, weight);
accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);
@@ -369,8 +448,757 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
- average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
+ average_16(&sum_row_c_0, &sum_row_c_1, &mul_constants_0, &mul_constants_1,
strength, rounding, weight);
accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
}
}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+ const uint16_t *v_dist,
+ __m128i *u_first, __m128i *u_second,
+ __m128i *v_first,
+ __m128i *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second);
+ read_dist_16(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+ // Otherwise, we only need to load 8 entries
+ __m128i u_reg, v_reg;
+
+ read_dist_8(u_dist, &u_reg);
+
+ *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+ *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+ read_dist_8(v_dist, &v_reg);
+
+ *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+ *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+ }
+}
+
+// Horizontally add unsigned 16-bit ints in src and store them as signed 32-bit
+// ints in dst.
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift_right = _mm_srli_si128(*src, 2);
+
+ const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
+ const __m128i even = _mm_blend_epi16(*src, zero, 170);
+
+ *dst = _mm_add_epi32(even, odd);
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+ int ss_x, int ss_y,
+ __m128i *u_mod,
+ __m128i *v_mod) {
+ __m128i y_reg;
+ if (!ss_x) {
+ read_dist_8(y_dist, &y_reg);
+ if (ss_y == 1) {
+ __m128i y_tmp;
+ read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+ y_reg = _mm_adds_epu16(y_reg, y_tmp);
+ }
+ } else {
+ __m128i y_first, y_second;
+ read_dist_16(y_dist, &y_first, &y_second);
+ if (ss_y == 1) {
+ __m128i y_tmp_0, y_tmp_1;
+ read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+ y_first = _mm_adds_epu16(y_first, y_tmp_0);
+ y_second = _mm_adds_epu16(y_second, y_tmp_1);
+ }
+
+ hadd_epu16(&y_first, &y_first);
+ hadd_epu16(&y_second, &y_second);
+
+ y_reg = _mm_packus_epi32(y_first, y_second);
+ }
+
+ *u_mod = _mm_adds_epu16(*u_mod, y_reg);
+ *v_mod = _mm_adds_epu16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void vp9_apply_temporal_filter_luma_16(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+ uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
+ const uint16_t *v_dist, const int16_t *const *neighbors_first,
+ const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul_first, mul_second;
+
+ __m128i sum_row_1_first, sum_row_1_second;
+ __m128i sum_row_2_first, sum_row_2_second;
+ __m128i sum_row_3_first, sum_row_3_second;
+
+ __m128i u_first, u_second;
+ __m128i v_first, v_second;
+
+ __m128i sum_row_first;
+ __m128i sum_row_second;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(block_width == 16);
+
+ (void)block_width;
+
+ // First row
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+ // Add luma values
+ get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
+ sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+ sum_row_second =
+ average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+ } else {
+ average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+ strength, rounding, weight);
+ }
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_src += y_src_stride;
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+ for (unsigned int h = 1; h < block_height - 1; ++h) {
+ // Move the weight to bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+ sum_row_second =
+ average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+ } else {
+ average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+ strength, rounding, weight);
+ }
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_src += y_src_stride;
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+ }
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+ sum_row_second =
+ average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+ } else {
+ average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+ strength, rounding, weight);
+ }
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void vp9_apply_temporal_filter_luma(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
+ const uint16_t *u_dist, const uint16_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors_first;
+ const int16_t *const *neighbors_second;
+
+ if (block_width == 16) {
+ // Special Case: The blockwidth is 16 and we are operating on a row of 16
+ // luma pixels. In this case, we can't use the usual left-middle-right
+ // pattern. We also don't support splitting now.
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ if (use_whole_blk) {
+ vp9_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+ block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight, NULL);
+ } else {
+ vp9_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+ block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ vp9_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+ ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+ ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight, NULL);
+ }
+
+ // Right
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ vp9_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+/* Apply temporal filter to the chroma components. This performs temporal
+ * filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+ * blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+ * else use top_weight for top half, and bottom weight for bottom half.
+ *
+ * Rows are processed in three phases: the first row, the interior rows and
+ * the last row. For every output row the get_sum_8() results of the row
+ * above, the current row and the row below (where they exist) are combined
+ * with saturating 16-bit adds, add_luma_dist_to_8_chroma_mod() then folds in
+ * the luma distortion, and the result is averaged and accumulated.
+ *
+ * y_src, y_pre, u_src, v_src: only advanced row by row here; this function
+ *     never dereferences them.
+ * u_pre, v_pre: rows handed to accumulate_and_store_8().
+ * uv_pre_stride: steps u_pre and v_pre, and is also used to step u_count,
+ *     u_accum, v_count and v_accum, so those buffers are walked with the
+ *     same pitch.
+ * uv_block_width: unused.
+ * uv_block_height: number of chroma rows to process.
+ * ss_x, ss_y: forwarded to add_luma_dist_to_8_chroma_mod(); ss_y also sets
+ *     how many luma rows (1 + ss_y) are stepped per chroma row.
+ * strength: forwarded to the averaging helpers and used for the rounding
+ *     term.
+ * y_dist, u_dist, v_dist: distortion rows with a pitch of DIST_STRIDE.
+ * neighbors: neighbors[0] is loaded for the first and last rows,
+ *     neighbors[1] for the interior rows.
+ * top_weight, bottom_weight: weights used when blk_fw is NULL.
+ * blk_fw: optional weights; entries 0 and 1 are used for the top half,
+ *     entries 2 and 3 once the row index reaches uv_block_height / 2.
+ */
+static void vp9_apply_temporal_filter_chroma_8(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int uv_block_width,
+ unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;  // half of (1 << strength)
+ int weight = top_weight;  // only read on the blk_fw == NULL paths
+
+ __m128i mul;
+
+ __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
+ __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+ __m128i u_sum_row, v_sum_row;
+
+ (void)uv_block_width;  // parameter is intentionally unused
+
+ // First row: there is no row above, so only two row sums are combined.
+ mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+
+ // Add chroma values (row_2 is the current row, row_3 the row below)
+ get_sum_8(u_dist, &u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+ u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
+
+ get_sum_8(v_dist, &v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+ v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result. With blk_fw the two weights blk_fw[0] and
+ // blk_fw[1] are passed together; otherwise a single weight is used.
+ if (blk_fw) {
+ u_sum_row =
+ average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ v_sum_row =
+ average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ } else {
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+ }
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_src += uv_src_stride;  // advance every pointer to the next chroma row
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_src += y_src_stride * (1 + ss_y);  // luma steps 1 + ss_y rows per chroma row
+ y_pre += y_pre_stride * (1 + ss_y);
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one; these use neighbors[1].
+ mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
+
+ for (unsigned int h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight pointer to the bottom half of the blocks
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+
+ // Shift the rows up: row_1 becomes the row above, row_2 the current row.
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values; row_3 is fetched fresh from the row below.
+ u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+ u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
+
+ v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+ v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
+ blk_fw[1]);
+ v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
+ blk_fw[1]);
+ } else {
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+ }
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_src += y_src_stride * (1 + ss_y);
+ y_pre += y_pre_stride * (1 + ss_y);
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row: same neighbors[0] entry as the first row.
+ mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+
+ // Shift the rows up; no new row_3 is fetched because no row lies below.
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values (row above plus current row only)
+ u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+ v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ u_sum_row =
+ average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ v_sum_row =
+ average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ } else {
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+ }
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+/* Perform temporal filter for the chroma components.
+ *
+ * The chroma block (block_width >> ss_x by block_height >> ss_y) is cut into
+ * columns of 8 chroma pixels and vp9_apply_temporal_filter_chroma_8() is run
+ * on each column. Columns are visited left to right: the leftmost column,
+ * the remaining columns of the left half, the columns of the right half
+ * except the last, and finally the rightmost column. The neighbor table
+ * passed down depends on the column position (left / middle / right) and on
+ * how many of ss_x and ss_y are set.
+ *
+ * blk_fw: dereferenced unconditionally, so it must not be NULL. blk_fw[0] is
+ *     always read; blk_fw[1], blk_fw[2] and blk_fw[3] are read only when
+ *     use_whole_blk is 0 (blk_fw[2] on entry, [1] and [3] for the right
+ *     half), or by the 8-wide helper in the single-column case.
+ * use_whole_blk: when non-zero, blk_fw[0] is used as both the top and the
+ *     bottom weight for every column.
+ * u_accum, u_count, v_accum, v_count: output buffers, offset per column by
+ *     the chroma column index.
+ * y_dist, u_dist, v_dist: distortion buffers; y_dist is offset by the luma
+ *     column (8 << ss_x per step), u_dist and v_dist by the chroma column.
+ */
+static void vp9_apply_temporal_filter_chroma(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;  // luma / chroma column offsets
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ }
+
+ // Whole-block mode passes the two scalar weights and a NULL blk_fw;
+ // otherwise blk_fw itself is passed and the scalar weights are zero.
+ if (use_whole_blk) {
+ vp9_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ top_weight, bottom_weight, NULL);
+ } else {
+ vp9_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left: the first 8-pixel column. The single-subsampling table is chosen
+ // when exactly one of ss_x and ss_y is set.
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ }
+
+ vp9_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+ strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First: remaining columns of the left half, same weights as Left.
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ top_weight, bottom_weight, NULL);
+ }
+
+ // From here on the right half is processed: switch to its weights.
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second: right-half columns before the last one; the middle-column
+ // neighbor table selected above is still in effect.
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ top_weight, bottom_weight, NULL);
+ }
+
+ // Right: the last 8-pixel column.
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ vp9_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+ strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+}
+
+// Apply the temporal filter to one block of Y, U and V samples.
+//
+// The function first fills three zero-initialized scratch buffers (y_dist,
+// u_dist, v_dist; BH rows of DIST_STRIDE entries each) with the precomputed
+// squared differences between the src and pre planes, then runs the luma
+// filter followed by the chroma filter on those buffers.
+//
+// Preconditions (checked by assert, so only in builds without NDEBUG):
+//   block_width  <= BW and a multiple of 16
+//   block_height <= BH and even
+//   ss_x, ss_y   each 0 or 1
+//   strength     in [0, 6]
+//   blk_fw[0]    in [0, 2]; blk_fw[1..3] in [0, 2] when use_whole_blk is 0
+//
+// y_accum/y_count, u_accum/u_count and v_accum/v_count are the output
+// buffers handed to the luma and chroma filters.
+void vp9_apply_temporal_filter_sse4_1(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+ const int *blk_fw_ptr = blk_fw;
+
+ // Writing starts at offset 1, so entry 0 of each row keeps its initial
+ // zero. NOTE(review): presumably a left border for the neighbor sums;
+ // confirm against the get_sum helpers.
+ uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+ // Zero and two are both accepted weights, so the messages say
+ // "non-negative" and "at most 2" to match the conditions.
+ assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+ assert(
+ (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+ "subblock filter weight must be non-negative");
+ assert(blk_fw[0] <= 2 && "filter weight must be at most 2");
+ assert(
+ (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+ "subblock filter weight must be at most 2");
+
+ // Precompute the difference squared, 16 luma pixels at a time
+ for (unsigned int row = 0; row < block_height; row++) {
+ for (unsigned int blk_col = 0; blk_col < block_width; blk_col += 16) {
+ store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ // Same for both chroma planes, 8 chroma pixels at a time
+ for (unsigned int row = 0; row < chroma_height; row++) {
+ for (unsigned int blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ // Rewind the distortion pointers to the first written entry before
+ // handing them to the filters.
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ vp9_apply_temporal_filter_luma(
+ y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr,
+ u_dist_ptr, v_dist_ptr);
+
+ vp9_apply_temporal_filter_chroma(
+ y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 7ca4004b0..c9a55669e 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -64,9 +64,12 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
+endif # !CONFIG_VP9_HIGHBITDEPTH
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 05981d689..67e5389a7 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -103,6 +103,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
+VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
@@ -137,10 +138,13 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+endif # !CONFIG_VP9_HIGHBITDEPTH
VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c
@@ -149,5 +153,6 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c
+VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 59f612b94..93a5f368b 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -28,5 +28,7 @@ VP9_DX_SRCS-yes += decoder/vp9_decoder.c
VP9_DX_SRCS-yes += decoder/vp9_decoder.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
+VP9_DX_SRCS-yes += decoder/vp9_job_queue.c
+VP9_DX_SRCS-yes += decoder/vp9_job_queue.h
VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))