diff options
Diffstat (limited to 'vp9')
42 files changed, 2845 insertions, 637 deletions
diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index 057d2e9c0..219ff63cb 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -150,8 +150,9 @@ static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) { return out; } -void vpx_highbd_iadst16_neon(const int32_t *input, int32_t *output, - uint16_t *dest, const int stride, const int bd) { +static void highbd_iadst16_neon(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { const int32x4_t c_1_31_5_27 = create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); const int32x4_t c_9_23_13_19 = @@ -424,11 +425,11 @@ void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, static const highbd_iht_2d IHT_16[] = { { vpx_highbd_idct16x16_256_add_half1d, vpx_highbd_idct16x16_256_add_half1d }, // DCT_DCT = 0 - { vpx_highbd_iadst16_neon, + { highbd_iadst16_neon, vpx_highbd_idct16x16_256_add_half1d }, // ADST_DCT = 1 { vpx_highbd_idct16x16_256_add_half1d, - vpx_highbd_iadst16_neon }, // DCT_ADST = 2 - { vpx_highbd_iadst16_neon, vpx_highbd_iadst16_neon } // ADST_ADST = 3 + highbd_iadst16_neon }, // DCT_ADST = 2 + { highbd_iadst16_neon, highbd_iadst16_neon } // ADST_ADST = 3 }; const highbd_iht_2d ht = IHT_16[tx_type]; int32_t row_output[16 * 16]; diff --git a/vp9/common/mips/msa/vp9_idct16x16_msa.c b/vp9/common/mips/msa/vp9_idct16x16_msa.c index 3e3530116..c03132280 100644 --- a/vp9/common/mips/msa/vp9_idct16x16_msa.c +++ b/vp9/common/mips/msa/vp9_idct16x16_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/vp9/common/mips/msa/vp9_idct4x4_msa.c b/vp9/common/mips/msa/vp9_idct4x4_msa.c index 786fbdb79..aaccd5ca7 100644 --- a/vp9/common/mips/msa/vp9_idct4x4_msa.c +++ b/vp9/common/mips/msa/vp9_idct4x4_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/vp9/common/mips/msa/vp9_idct8x8_msa.c b/vp9/common/mips/msa/vp9_idct8x8_msa.c index e4166775d..76d15ff8c 100644 --- a/vp9/common/mips/msa/vp9_idct8x8_msa.c +++ b/vp9/common/mips/msa/vp9_idct8x8_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/vp9/common/ppc/vp9_idct_vsx.c b/vp9/common/ppc/vp9_idct_vsx.c index 1b2a93edb..e861596ad 100644 --- a/vp9/common/ppc/vp9_idct_vsx.c +++ b/vp9/common/ppc/vp9_idct_vsx.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/ppc/inv_txfm_vsx.h" #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index f0887157e..e07a9f2d3 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -176,7 +176,7 @@ typedef struct macroblockd { FRAME_CONTEXT *fc; /* pointers to reference frames */ - RefBuffer *block_refs[2]; + const RefBuffer *block_refs[2]; /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index bc665534d..b33a3a297 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE { MAX_PROFILES } BITSTREAM_PROFILE; +typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG; + #define BLOCK_4X4 0 #define BLOCK_4X8 1 #define BLOCK_8X4 2 diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 8bb68cfdf..00c4414ad 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -183,11 +183,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; +# +# Apply temporal filter +# if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { -add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse4_1/; +add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; +specialize qw/vp9_apply_temporal_filter sse4_1/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; + } } + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index b008ed5cf..00882a5f9 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -475,6 +475,12 @@ void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, #endif // CONFIG_MULTITHREAD } +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); +} + // Accumulate frame counts. void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h index b97e9ee13..1a2d79abd 100644 --- a/vp9/common/vp9_thread_common.h +++ b/vp9/common/vp9_thread_common.h @@ -70,7 +70,7 @@ void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, int corrupted); -void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row); +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync); void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index c9c85053d..41072d5e0 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -42,6 +42,7 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_job_queue.h" #define MAX_VP9_HEADER_SIZE 80 @@ -1027,7 +1028,6 @@ static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { VP9_COMMON *const cm = &pbi->common; - const int less8x8 = bsize < BLOCK_8X8; const int bw = 1 << (bwl - 1); const int bh = 1 << (bhl - 1); const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); @@ -1059,7 +1059,7 @@ static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, const int eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt); - if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter + if (bsize >= BLOCK_8X8 && eobtotal == 0) mi->skip = 1; // skip loopfilter } } @@ -1172,9 +1172,10 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); } -static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi, - int mi_row, int mi_col, BLOCK_SIZE bsize, - int n4x4_l2) { +static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2, int parse_recon_flag, + process_block_fn_t process_block) { VP9_COMMON *const cm = &pbi->common; const int n8x8_l2 = n4x4_l2 - 1; const int num_8x8_wh = 1 << n8x8_l2; @@ -1187,60 +1188,10 @@ static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - partition = *xd->partition; - xd->partition++; - - subsize = get_subsize(bsize, partition); - if (!hbs) { - // calculate bmode block dimensions (log 2) - xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); - xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); - recon_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); - } else { - switch (partition) { - case PARTITION_NONE: - recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); - break; - case PARTITION_HORZ: - recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); - if (has_rows) - recon_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, - n8x8_l2); - break; - case PARTITION_VERT: - recon_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); - if (has_cols) - recon_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, - n4x4_l2); - break; - case PARTITION_SPLIT: - recon_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2); - recon_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2); - recon_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2); - recon_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2); - break; - default: assert(0 && "Invalid partition type"); - } + if (parse_recon_flag & PARSE) { + *xd->partition = + read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); } -} - -static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi, - int mi_row, int mi_col, BLOCK_SIZE bsize, - int n4x4_l2) { - VP9_COMMON *const cm = &pbi->common; - const int n8x8_l2 = n4x4_l2 - 1; - const int num_8x8_wh = 1 << n8x8_l2; - const int hbs = num_8x8_wh >> 1; - PARTITION_TYPE partition; - BLOCK_SIZE subsize; - const int has_rows = (mi_row + hbs) < cm->mi_rows; - const int has_cols = (mi_col + hbs) < cm->mi_cols; - MACROBLOCKD *const xd = &twd->xd; - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - *xd->partition = - read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); partition = *xd->partition; xd->partition++; @@ -1250,38 +1201,44 @@ static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi, // calculate bmode block dimensions (log 2) xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); - parse_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); } else { switch (partition) { case PARTITION_NONE: - parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); break; case PARTITION_HORZ: - parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); if (has_rows) - parse_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, - n8x8_l2); + process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); break; case PARTITION_VERT: - parse_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); if (has_cols) - parse_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, - n4x4_l2); + process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); break; case PARTITION_SPLIT: - parse_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2); - parse_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2); - parse_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2); - parse_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2); + process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, + n8x8_l2, parse_recon_flag, process_block); break; default: assert(0 && "Invalid partition type"); } } - // update partition context - if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); + if (parse_recon_flag & PARSE) { + // update partition context + if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) && + bsize >= BLOCK_8X8) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); + } } static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, @@ -1688,6 +1645,317 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data, } } +static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); + row_mt_worker_data->recon_map[map_idx] = 1; + pthread_cond_signal(&row_mt_worker_data->recon_sync_cond[sync_idx]); + pthread_mutex_unlock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + volatile int8_t *map = row_mt_worker_data->recon_map + map_idx; + pthread_mutex_t *const mutex = + &row_mt_worker_data->recon_sync_mutex[sync_idx]; + pthread_mutex_lock(mutex); + while (!(*map)) { + pthread_cond_wait(&row_mt_worker_data->recon_sync_cond[sync_idx], mutex); + } + pthread_mutex_unlock(mutex); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) { + int return_val = 0; +#if CONFIG_MULTITHREAD + int corrupted; + pthread_mutex_lock(&lf_sync->lf_mutex); + corrupted = lf_sync->corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + if (!corrupted) { + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tile_cols == lf_sync->num_tiles_done[row]) return_val = 1; + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); + } +#else + (void)lf_sync; + (void)row; + (void)num_tile_cols; +#endif + return return_val; +} + +static void vp9_tile_done(VP9Decoder *pbi) { +#if CONFIG_MULTITHREAD + int terminate; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int all_parse_done = 1 << pbi->common.log2_tile_cols; + pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex); + row_mt_worker_data->num_tiles_done++; + terminate = all_parse_done == row_mt_worker_data->num_tiles_done; + pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex); + if (terminate) { + vp9_jobq_terminate(&row_mt_worker_data->jobq); + } +#else + (void)pbi; +#endif +} + +static void vp9_jobq_alloc(VP9Decoder *pbi) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job); + + if (jobq_size > row_mt_worker_data->jobq_size) { + vpx_free(row_mt_worker_data->jobq_buf); + CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, + jobq_size); + row_mt_worker_data->jobq_size = jobq_size; + } +} + +static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int is_last_row, VP9LfSync *lf_sync, + int cur_tile_col) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int tile_cols = 1 << cm->log2_tile_cols; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int mi_col_start = tile_data->xd.tile.mi_col_start; + int mi_col_end = tile_data->xd.tile.mi_col_end; + int mi_col; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + + // Top Dependency + if (cur_sb_row) { + map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c, + ((cur_sb_row - 1) * tile_cols) + cur_tile_col); + } + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON, + recon_block); + if (cm->lf.filter_level && !cm->skip_loop_filter) { + // Queue LPF_JOB + int is_lpf_job_ready = 0; + + if (mi_col + MI_BLOCK_SIZE >= mi_col_end) { + // Checks if this row has been decoded in all tiles + is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols); + + if (is_lpf_job_ready) { + Job lpf_job; + lpf_job.job_type = LPF_JOB; + if (cur_sb_row > 0) { + lpf_job.row_num = mi_row - MI_BLOCK_SIZE; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + if (is_last_row) { + lpf_job.row_num = mi_row; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + } + } + } + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + cur_tile_col); + } +} + +static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int cur_tile_col, uint8_t **data_end) { + int mi_col; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + TileInfo *tile = &tile_data->xd.tile; + TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col]; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, cm, 0, cur_tile_col); + + /* Update reader only at the beginning of each row in a tile */ + if (mi_row == 0) { + setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + } + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + tile_data->xd.error_info = &tile_data->error_info; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE, + parse_block); + } +} + +static int row_decode_worker_hook(ThreadData *const thread_data, + uint8_t **data_end) { + VP9Decoder *const pbi = thread_data->pbi; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + Job job; + LFWorkerData *lf_data = thread_data->lf_data; + VP9LfSync *lf_sync = thread_data->lf_sync; + volatile int corrupted = 0; + + while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) { + int mi_col; + const int mi_row = job.row_num; + + if (job.job_type == LPF_JOB) { + lf_data->start = mi_row; + lf_data->stop = lf_data->start + MI_BLOCK_SIZE; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + mi_row < cm->mi_rows) { + vp9_loopfilter_job(lf_data, lf_sync); + } + } else if (job.job_type == RECON_JOB) { + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int is_last_row = sb_rows - 1 == cur_sb_row; + TileWorkerData twd_recon; + TileWorkerData *const tile_data_recon = &twd_recon; + int mi_col_start, mi_col_end; + + tile_data_recon->xd = pbi->mb; + vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col); + vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff); + mi_col_start = tile_data_recon->xd.tile.mi_col_start; + mi_col_end = tile_data_recon->xd.tile.mi_col_end; + + if (setjmp(tile_data_recon->error_info.jmp)) { + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + tile_data_recon->error_info.setjmp = 0; + corrupted = 1; + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + job.tile_col); + } + if (is_last_row) { + vp9_tile_done(pbi); + } + continue; + } + + tile_data_recon->error_info.setjmp = 1; + tile_data_recon->xd.error_info = &tile_data_recon->error_info; + + recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync, + job.tile_col); + + if (corrupted) + vpx_internal_error(&tile_data_recon->error_info, + VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (is_last_row) { + vp9_tile_done(pbi); + } + } else if (job.job_type == PARSE_JOB) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col]; + + if (setjmp(tile_data->error_info.jmp)) { + tile_data->error_info.setjmp = 0; + corrupted = 1; + vp9_tile_done(pbi); + continue; + } + + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts; + + tile_data->error_info.setjmp = 1; + + parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end); + + corrupted |= tile_data->xd.corrupted; + if (corrupted) + vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + /* Queue in the recon_job for this row */ + { + Job recon_job; + recon_job.row_num = mi_row; + recon_job.tile_col = job.tile_col; + recon_job.job_type = RECON_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job, + sizeof(recon_job)); + } + + /* Queue next parse job */ + if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) { + Job parse_job; + parse_job.row_num = mi_row + MI_BLOCK_SIZE; + parse_job.tile_col = job.tile_col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, + sizeof(parse_job)); + } + } + } + + return !corrupted; +} + static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; @@ -1775,7 +2043,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, row_mt_worker_data->dqcoeff[plane]; } tile_data->xd.partition = row_mt_worker_data->partition; - parse_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + PARSE, parse_block); for (plane = 0; plane < MAX_MB_PLANE; ++plane) { tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; @@ -1783,7 +2052,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, row_mt_worker_data->dqcoeff[plane]; } tile_data->xd.partition = row_mt_worker_data->partition; - recon_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + RECON, recon_block); } else { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } @@ -1951,22 +2221,12 @@ static int compare_tile_buffers(const void *a, const void *b) { return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size); } -static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, - const uint8_t *data_end) { +static INLINE void init_mt(VP9Decoder *pbi) { + int n; VP9_COMMON *const cm = &pbi->common; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - const uint8_t *bit_reader_end = NULL; VP9LfSync *lf_row_sync = &pbi->lf_row_sync; - YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - const int num_workers = VPXMIN(pbi->max_threads, tile_cols); - int n; - - assert(tile_cols <= (1 << 6)); - assert(tile_rows == 1); - (void)tile_rows; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; @@ -1985,11 +2245,160 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, } // Initialize LPF - if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level && + !cm->skip_loop_filter) { vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, pbi->num_tile_workers); } + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + vp9_reset_lfm(cm); +} + +static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = pbi->max_threads; + int i, n; + int col; + int corrupted = 0; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + memset(row_mt_worker_data->recon_map, 0, + sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map)); + + init_mt(pbi); + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n]; + winterface->sync(worker); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + thread_data->lf_sync = lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm, + pbi->mb.plane); + } + + thread_data->pbi = pbi; + + worker->hook = (VPxWorkerHook)row_decode_worker_hook; + worker->data1 = thread_data; + worker->data2 = (void *)&row_mt_worker_data->data_end; + } + + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + } + + /* Reset the jobq to start of the jobq buffer */ + vp9_jobq_reset(&row_mt_worker_data->jobq); + row_mt_worker_data->num_tiles_done = 0; + row_mt_worker_data->data_end = NULL; + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Initialize thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = + (TileWorkerData *)&pbi->tile_worker_data[col]; + vp9_zero(tile_data->counts); + } + } + + // queue parse jobs for 0th row of every tile + for (col = 0; col < tile_cols; ++col) { + Job parse_job; + parse_job.row_num = 0; + parse_job.tile_col = col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job)); + } + + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + worker->had_error = 0; + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + corrupted |= !winterface->sync(worker); + } + + pbi->mb.corrupted = corrupted; + + { + /* Set data end */ + TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1]; + row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader); + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (i = 0; i < tile_cols; ++i) { + TileWorkerData *const tile_data = + (TileWorkerData *)&pbi->tile_worker_data[i]; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + return row_mt_worker_data->data_end; +} + +static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = VPXMIN(pbi->max_threads, tile_cols); + int n; + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + init_mt(pbi); + // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; @@ -2012,15 +2421,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, worker->data2 = pbi; } - // Note: this memset assumes above_context[0], [1] and [2] - // are allocated as part of the same buffer. - memset(cm->above_context, 0, - sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); - memset(cm->above_seg_context, 0, - sizeof(*cm->above_seg_context) * aligned_mi_cols); - - vp9_reset_lfm(cm); - // Load tile data into tile_buffers get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, &pbi->tile_buffers); @@ -2366,25 +2766,32 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_tile_info(cm, rb); if (pbi->row_mt == 1) { int num_sbs = 1; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int num_jobs = sb_rows << cm->log2_tile_cols; if (pbi->row_mt_worker_data == NULL) { CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); +#endif } if (pbi->max_threads > 1) { const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; - const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); - const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; num_sbs = sb_cols * sb_rows; } - if (num_sbs > pbi->row_mt_worker_data->num_sbs) { + if (num_sbs > pbi->row_mt_worker_data->num_sbs || + num_jobs > pbi->row_mt_worker_data->num_jobs) { vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); - vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs); + vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs, + pbi->max_threads, num_jobs); } + vp9_jobq_alloc(pbi); } sz = vpx_rb_read_literal(rb, 16); @@ -2544,21 +2951,27 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, pbi->total_tiles = tile_rows * tile_cols; } - if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { - // Multi-threaded tile decoder - *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); - if (!pbi->lpf_mt_opt) { - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, - cm->lf.filter_level, 0, 0, pbi->tile_workers, - pbi->num_tile_workers, &pbi->lf_row_sync); + if (pbi->max_threads > 1 && tile_rows == 1 && + (tile_cols > 1 || pbi->row_mt == 1)) { + if (pbi->row_mt == 1) { + *p_data_end = + decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end); + } else { + // Multi-threaded tile decoder + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt( + new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0, + pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); } - } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); } } } else { diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 7fde0b07f..0aed3d717 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -56,10 +56,34 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) { } void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, - VP9_COMMON *cm, int num_sbs) { + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs) { int plane; const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) * sizeof(*row_mt_worker_data->dqcoeff[0]); + row_mt_worker_data->num_jobs = num_jobs; +#if CONFIG_MULTITHREAD + { + int i; + CHECK_MEM_ERROR( + cm, row_mt_worker_data->recon_sync_mutex, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); + if (row_mt_worker_data->recon_sync_mutex) { + for (i = 0; i < num_jobs; ++i) { + pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR( + cm, row_mt_worker_data->recon_sync_cond, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); + if (row_mt_worker_data->recon_sync_cond) { + for (i = 0; i < num_jobs; ++i) { + pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL); + } + } + } +#endif row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], @@ -74,11 +98,36 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, sizeof(*row_mt_worker_data->partition))); CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); + + // allocate memory for thread_data + if (row_mt_worker_data->thread_data == NULL) { + const size_t thread_size = + max_threads * sizeof(*row_mt_worker_data->thread_data); + CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + vpx_memalign(32, thread_size)); + } } void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { if (row_mt_worker_data != NULL) { int plane; +#if CONFIG_MULTITHREAD + int i; + if (row_mt_worker_data->recon_sync_mutex != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]); + } + vpx_free(row_mt_worker_data->recon_sync_mutex); + row_mt_worker_data->recon_sync_mutex = NULL; + } + if (row_mt_worker_data->recon_sync_cond != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]); + } + vpx_free(row_mt_worker_data->recon_sync_cond); + row_mt_worker_data->recon_sync_cond = NULL; + } +#endif for (plane = 0; plane < 3; ++plane) { vpx_free(row_mt_worker_data->eob[plane]); row_mt_worker_data->eob[plane] = NULL; @@ -89,6 +138,8 @@ void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { row_mt_worker_data->partition = NULL; vpx_free(row_mt_worker_data->recon_map); row_mt_worker_data->recon_map = NULL; + vpx_free(row_mt_worker_data->thread_data); + row_mt_worker_data->thread_data = NULL; } } @@ -179,8 +230,16 @@ void vp9_decoder_remove(VP9Decoder *pbi) { if (pbi->row_mt == 1) { vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + if (pbi->row_mt_worker_data != NULL) { + vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq); + vpx_free(pbi->row_mt_worker_data->jobq_buf); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex); +#endif + } vpx_free(pbi->row_mt_worker_data); } + vp9_remove_common(&pbi->common); vpx_free(pbi); } diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 9a582fffb..4a22aa6b5 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -21,6 +21,7 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" +#include "./vp9_job_queue.h" #ifdef __cplusplus extern "C" { @@ -30,6 +31,14 @@ extern "C" { #define DQCOEFFS_PER_SB_LOG2 12 #define PARTITIONS_PER_SB 85 +typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType; + +typedef struct ThreadData { + struct VP9Decoder *pbi; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; +} ThreadData; + typedef struct TileBuffer { const uint8_t *data; size_t size; @@ -49,14 +58,38 @@ typedef struct TileWorkerData { struct vpx_internal_error_info error_info; } TileWorkerData; +typedef void (*process_block_fn_t)(TileWorkerData *twd, + struct VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, + int bhl); + typedef struct RowMTWorkerData { int num_sbs; int *eob[MAX_MB_PLANE]; PARTITION_TYPE *partition; tran_low_t *dqcoeff[MAX_MB_PLANE]; int8_t *recon_map; + const uint8_t *data_end; + uint8_t *jobq_buf; + JobQueueRowMt jobq; + size_t jobq_size; + int num_tiles_done; + int num_jobs; +#if CONFIG_MULTITHREAD + pthread_mutex_t recon_done_mutex; + pthread_mutex_t *recon_sync_mutex; + pthread_cond_t *recon_sync_cond; +#endif + ThreadData *thread_data; } RowMTWorkerData; +/* Structure to queue and dequeue row decode jobs */ +typedef struct Job { + int row_num; + int tile_col; + JobType job_type; +} Job; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -128,7 +161,8 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); void vp9_decoder_remove(struct VP9Decoder *pbi); void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, - VP9_COMMON *cm, int num_sbs); + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs); void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data); static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, diff --git a/vp9/decoder/vp9_job_queue.c b/vp9/decoder/vp9_job_queue.c new file mode 100644 index 000000000..9a31f5a6d --- /dev/null +++ b/vp9/decoder/vp9_job_queue.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <string.h> + +#include "vpx/vpx_integer.h" + +#include "vp9/decoder/vp9_job_queue.h" + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) { +#if CONFIG_MULTITHREAD + pthread_mutex_init(&jobq->mutex, NULL); + pthread_cond_init(&jobq->cond, NULL); +#endif + jobq->buf_base = buf; + jobq->buf_wr = buf; + jobq->buf_rd = buf; + jobq->buf_end = buf + buf_size; + jobq->terminate = 0; +} + +void vp9_jobq_reset(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->buf_wr = jobq->buf_base; + jobq->buf_rd = jobq->buf_base; + jobq->terminate = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +void vp9_jobq_deinit(JobQueueRowMt *jobq) { + vp9_jobq_reset(jobq); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&jobq->mutex); + pthread_cond_destroy(&jobq->cond); +#endif +} + +void vp9_jobq_terminate(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->terminate = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(&jobq->cond); + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_wr + job_size) { + memcpy(jobq->buf_wr, job, job_size); + jobq->buf_wr = jobq->buf_wr + job_size; +#if CONFIG_MULTITHREAD + pthread_cond_signal(&jobq->cond); +#endif + ret = 0; + } else { + /* Wrap around case is not supported */ + assert(0); + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + return ret; +} + +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_rd + job_size) { + while (1) { + if (jobq->buf_wr >= jobq->buf_rd + job_size) { + memcpy(job, jobq->buf_rd, job_size); + jobq->buf_rd = jobq->buf_rd + job_size; + ret = 0; + break; + } else { + /* If all the entries have been dequeued, then break and return */ + if (jobq->terminate == 1) { + ret = 1; + break; + } + if (blocking == 1) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(&jobq->cond, &jobq->mutex); +#endif + } else { + /* If there is no job available, + * and this is non blocking call then return fail */ + ret = 1; + break; + } + } + } + } else { + /* Wrap around case is not supported */ + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + + return ret; +} diff --git a/vp9/decoder/vp9_job_queue.h b/vp9/decoder/vp9_job_queue.h new file mode 100644 index 000000000..bc23bf9c2 --- /dev/null +++ b/vp9/decoder/vp9_job_queue.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ + +#include "vpx_util/vpx_thread.h" + +typedef struct { + // Pointer to buffer base which contains the jobs + uint8_t *buf_base; + + // Pointer to current address where new job can be added + uint8_t *volatile buf_wr; + + // Pointer to current address from where next job can be obtained + uint8_t *volatile buf_rd; + + // Pointer to end of job buffer + uint8_t *buf_end; + + int terminate; + +#if CONFIG_MULTITHREAD + pthread_mutex_t mutex; + pthread_cond_t cond; +#endif +} JobQueueRowMt; + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size); +void vp9_jobq_reset(JobQueueRowMt *jobq); +void vp9_jobq_deinit(JobQueueRowMt *jobq); +void vp9_jobq_terminate(JobQueueRowMt *jobq); +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size); +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking); + +#endif // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ diff --git a/vp9/encoder/mips/msa/vp9_error_msa.c b/vp9/encoder/mips/msa/vp9_error_msa.c index 188d04d8f..61786d8f6 100644 --- a/vp9/encoder/mips/msa/vp9_error_msa.c +++ b/vp9/encoder/mips/msa/vp9_error_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -79,6 +80,7 @@ return err; \ } +#if !CONFIG_VP9_HIGHBITDEPTH BLOCK_ERROR_BLOCKSIZE_MSA(16); BLOCK_ERROR_BLOCKSIZE_MSA(64); BLOCK_ERROR_BLOCKSIZE_MSA(256); @@ -103,3 +105,4 @@ int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr, return err; } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c index 0831e5914..efbbe830d 100644 --- a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" diff --git a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c index fa36f09ab..9c5cc12ef 100644 --- a/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" diff --git a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c index 604db853c..26d81aa9e 100644 --- a/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c +++ b/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index a2a742493..ef8cd46b4 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -479,7 +479,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { double weight_segment_target = 0; double weight_segment = 0; int thresh_low_motion = (cm->width < 720) ? 55 : 20; - int qp_thresh = VPXMIN(20, rc->best_quality << 1); + int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20, + rc->best_quality << 1); cr->apply_cyclic_refresh = 1; if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 || is_lossless_requested(&cpi->oxcf) || diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index d2cdb1010..4e301cc17 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -91,6 +91,9 @@ typedef struct PC_TREE { struct PC_TREE *split[4]; PICK_MODE_CONTEXT *leaf_split[4]; }; + // Obtained from a simple motion search. Used by the ML based partition search + // speed feature. + MV mv; } PC_TREE; void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td); diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 2820b71b4..65ce15ff7 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -201,7 +201,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( int i; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; - RefBuffer *saved_block_refs[2]; + const RefBuffer *saved_block_refs[2]; MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 5adefac1a..236567f94 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3440,18 +3440,59 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, #undef FEATURES #undef LABELS +// Perform fast and coarse motion search for the given block. This is a +// pre-processing step for the ML based partition search speedup. +static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV ref_mv, MV_REFERENCE_FRAME ref, + uint8_t *const pred_buf) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref); + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; + MV best_mv = { 0, 0 }; + int cost_list[5]; + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ref - 1].sf); + mi->ref_frame[0] = ref; + mi->ref_frame[1] = NONE; + mi->sb_type = bsize; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, + &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); +} + // Use a neural net model to prune partition-none and partition-split search. // The model uses prediction residue variance and quantization step size as // input features. #define FEATURES 6 -static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x, +static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, BLOCK_SIZE bsize, int mi_row, int mi_col, int *none, int *split) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi[0]; + const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; #if CONFIG_VP9_HIGHBITDEPTH + MACROBLOCKD *xd = &x->e_mbd; DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? (CONVERT_TO_BYTEPTR(pred_buffer)) @@ -3489,41 +3530,20 @@ static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x, if (!nn_config) return; - mi->ref_frame[1] = NONE; - mi->sb_type = bsize; // Do a simple single motion search to find a prediction for current block. // The variance of the residue will be used as input features. { + MV ref_mv; const MV_REFERENCE_FRAME ref = cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; - YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref); - MV ref_mv = { 0, 0 }; - MV ref_mv_full = { 0, 0 }; - const int step_param = 1; - const MvLimits tmp_mv_limits = x->mv_limits; - const SEARCH_METHODS search_method = NSTEP; - const int sadpb = x->sadperbit16; - MV best_mv = { 0, 0 }; - int cost_list[5]; - - assert(yv12 != NULL); - if (!yv12) return; - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[ref - 1].sf); - mi->ref_frame[0] = ref; - vp9_set_mv_search_range(&x->mv_limits, &ref_mv); - vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, - search_method, sadpb, cond_cost_list(cpi, cost_list), - &ref_mv, &best_mv, 0, 0); - best_mv.row *= 8; - best_mv.col *= 8; - x->mv_limits = tmp_mv_limits; - mi->mv[0].as_mv = best_mv; - - set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); - xd->plane[0].dst.buf = pred_buf; - xd->plane[0].dst.stride = 64; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + // If bsize is 64x64, use zero MV as reference; otherwise, use MV result + // of previous(larger) block as reference. + if (bsize == BLOCK_64X64) + ref_mv.row = ref_mv.col = 0; + else + ref_mv = pc_tree->mv; + simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf); + pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv; } vpx_clear_system_state(); @@ -3818,14 +3838,19 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->partitioning = PARTITION_NONE; - if (cpi->sf.ml_var_partition_pruning) { + if (cpi->sf.ml_var_partition_pruning && !frame_is_intra_only(cm)) { const int do_ml_var_partition_pruning = - !frame_is_intra_only(cm) && partition_none_allowed && do_split && + partition_none_allowed && do_split && mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; if (do_ml_var_partition_pruning) { - ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col, + ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, &partition_none_allowed, &do_split); + } else { + vp9_zero(pc_tree->mv); + } + if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. + for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv; } } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index bf35b3570..b8c86ea43 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -29,6 +29,9 @@ #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif @@ -2570,8 +2573,19 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->feature_score_loc_arr); vpx_free(cpi->feature_score_loc_sort); vpx_free(cpi->feature_score_loc_heap); + vpx_free(cpi->select_mv_arr); #endif for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int sqr_bsize; + for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]); + } + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + } +#endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); cpi->tpl_stats[frame].is_valid = 0; } @@ -5829,31 +5843,6 @@ static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, } } -#if CONFIG_NON_GREEDY_MV -double get_feature_score(uint8_t *buf, ptrdiff_t stride, int rows, int cols) { - double IxIx = 0; - double IxIy = 0; - double IyIy = 0; - double score; - int r, c; - vpx_clear_system_state(); - for (r = 0; r + 1 < rows; ++r) { - for (c = 0; c + 1 < cols; ++c) { - int diff_x = buf[r * stride + c] - buf[r * stride + c + 1]; - int diff_y = buf[r * stride + c] - buf[(r + 1) * stride + c]; - IxIx += diff_x * diff_x; - IxIy += diff_x * diff_y; - IyIy += diff_y * diff_y; - } - } - IxIx /= (rows - 1) * (cols - 1); - IxIy /= (rows - 1) * (cols - 1); - IyIy /= (rows - 1) * (cols - 1); - score = (IxIx * IyIy - IxIy * IxIy + 0.0001) / (IxIx + IyIy + 0.0001); - return score; -} -#endif - static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, int mi_col) { x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); @@ -6026,6 +6015,375 @@ static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } #if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 +#define ZERO_MV_MODE 0 +#define NEW_MV_MODE 1 +#define NEAREST_MV_MODE 2 +#define NEAR_MV_MODE 3 +#define MAX_MV_MODE 4 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = *get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, tpl_frame, rf_idx, bsize, mi_row, + mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)sse; + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 9; + const int new_mv_prob = 77; + const int ref_mv_prob = 170; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + int rf_idx, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, tpl_frame, rf_idx, bsize, + mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, tpl_frame, rf_idx, + bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, tpl_frame, rf_idx, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost; + } + return mv_cost; +} + +static double rd_cost(int rdmult, int rddiv, double rate, double dist) { + return (rate * rdmult) / (1 << 9) + dist * (1 << rddiv); +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = get_mv_dist(mv_mode, cpi, xd, gf_picture, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, tpl_frame, rf_idx, bsize, mi_row, mi_col); + + return rd_cost(x->rdmult, x->rddiv, mv_cost, mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagnal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = + find_best_ref_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, nb_row, nb_col, &this_rd, mv); + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + new_mv_rd = eval_mv_mode(NEW_MV_MODE, cpi, x, gf_picture, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, + &select_mv_arr[mi_row * stride + mi_col]); + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. + for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = + find_best_ref_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + *rd = no_new_mv_rd; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + } else { + *rd = new_mv_rd; + } +} + +void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + double rd; // TODO(angiebird): Use this information later. + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, rf_idx, bsize, + mi_row, mi_col, &rd); + } + } +} + +static double get_feature_score(uint8_t *buf, ptrdiff_t stride, int rows, + int cols) { + double IxIx = 0; + double IxIy = 0; + double IyIy = 0; + double score; + int r, c; + vpx_clear_system_state(); + for (r = 0; r + 1 < rows; ++r) { + for (c = 0; c + 1 < cols; ++c) { + int diff_x = buf[r * stride + c] - buf[r * stride + c + 1]; + int diff_y = buf[r * stride + c] - buf[(r + 1) * stride + c]; + IxIx += diff_x * diff_x; + IxIy += diff_x * diff_y; + IyIy += diff_y * diff_y; + } + } + IxIx /= (rows - 1) * (cols - 1); + IxIy /= (rows - 1) * (cols - 1); + IyIy /= (rows - 1) * (cols - 1); + score = (IxIx * IyIy - IxIy * IxIy + 0.0001) / (IxIx + IyIy + 0.0001); + return score; +} + static int compare_feature_score(const void *a, const void *b) { const FEATURE_SCORE_LOC *aa = *(FEATURE_SCORE_LOC *const *)a; const FEATURE_SCORE_LOC *bb = *(FEATURE_SCORE_LOC *const *)b; @@ -6156,11 +6514,13 @@ static void build_motion_field(VP9_COMP *cpi, MACROBLOCKD *xd, int frame_idx, TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; int fs_loc_sort_size; int fs_loc_heap_size; int mi_row, mi_col; - tpl_frame->lambda = 250; + tpl_frame->lambda = (pw * ph) / 4; fs_loc_sort_size = 0; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { @@ -6449,6 +6809,10 @@ static void init_tpl_buffer(VP9_COMP *cpi) { cpi->feature_score_loc_alloc = 1; } + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + cm, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); #endif // TODO(jingning): Reduce the actual memory use for tpl model build up. @@ -6459,16 +6823,21 @@ static void init_tpl_buffer(VP9_COMP *cpi) { continue; #if CONFIG_NON_GREEDY_MV - vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr); for (rf_idx = 0; rf_idx < 3; ++rf_idx) { for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]); CHECK_MEM_ERROR( cm, cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize], vpx_calloc( - mi_rows * mi_cols, + mi_rows * mi_cols * 4, sizeof( *cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]))); } + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); } #endif vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 18adfebfe..1e1c6b715 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -320,6 +320,7 @@ typedef struct TplDepFrame { double mv_dist_sum[3]; double mv_cost_sum[3]; int_mv *pyramid_mv_arr[3][SQUARE_BLOCK_SIZES]; + int *mv_mode_arr[3]; #endif } TplDepFrame; @@ -337,8 +338,7 @@ static INLINE int get_square_block_idx(BLOCK_SIZE bsize) { if (bsize == BLOCK_32X32) { return 3; } - printf("ERROR: non-square block size\n"); - assert(0); + assert(0 && "ERROR: non-square block size"); return -1; } @@ -355,8 +355,7 @@ static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) { if (square_block_idx == 3) { return BLOCK_32X32; } - printf("ERROR: invalid square_block_idx\n"); - assert(0); + assert(0 && "ERROR: invalid square_block_idx"); return BLOCK_INVALID; } @@ -592,6 +591,7 @@ typedef struct VP9_COMP { FEATURE_SCORE_LOC *feature_score_loc_arr; FEATURE_SCORE_LOC **feature_score_loc_sort; FEATURE_SCORE_LOC **feature_score_loc_heap; + int_mv *select_mv_arr; #endif TileDataEnc *tile_data; @@ -947,8 +947,8 @@ static INLINE RefCntBuffer *get_ref_cnt_buffer(VP9_COMMON *cm, int fb_idx) { } static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( - VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) { + const VP9_COMMON *const cm = &cpi->common; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL; @@ -1027,7 +1027,7 @@ static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { cpi->oxcf.enable_auto_arf; } -static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, +static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { xd->block_refs[0] = diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 8f0da48a2..5cfffe6b5 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -549,7 +549,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { } #define FP_DN_THRESH 8 -#define FP_MAX_DN_THRESH 16 +#define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 // Baseline Kernal weights for first pass noise metric @@ -843,6 +843,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, double mb_intra_factor; double mb_brightness_factor; double mb_neutral_count; + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); @@ -1254,7 +1255,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } #endif - // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1280,14 +1280,13 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, else if (mv.col < 0) --(fp_acc_data->sum_in_vectors); } - fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + } + if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error + } else { fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; } } else { // Intra < inter error - int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); if (this_motion_error < scaled_low_intra_thresh) { @@ -2399,8 +2398,12 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, twopass->arnr_strength_adjustment = 0; - if ((section_zeromv < 0.10) || (section_noise <= (SECTION_NOISE_DEF * 0.75))) + if (section_noise < 150) { twopass->arnr_strength_adjustment -= 1; + if (section_noise < 75) twopass->arnr_strength_adjustment -= 1; + } else if (section_noise > 250) + twopass->arnr_strength_adjustment += 1; + if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1; } diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 5a6717ab2..63f7f9957 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -29,11 +29,6 @@ // #define NEW_DIAMOND_SEARCH -static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, - const MV *mv) { - return &buf->buf[mv->row * buf->stride + mv->col]; -} - void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); @@ -1646,7 +1641,7 @@ static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, // Exhuastive motion search around a given centre position with a given // step size. -static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, +static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int range, int step, int sad_per_bit, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { @@ -1732,6 +1727,9 @@ static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, return best_sad; } +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 #if CONFIG_NON_GREEDY_MV double vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, int mv_num) { @@ -1757,6 +1755,152 @@ double vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, return best_cost; } +static double exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv, + int range, int step, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, double lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + MV fcenter_mv = { center_mv->row, center_mv->col }; + double best_sad; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *best_mv = fcenter_mv; + best_sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) + + lambda * vp9_nb_mvs_inconsistency(&fcenter_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(-range, x->mv_limits.row_min - fcenter_mv.row); + start_col = VPXMAX(-range, x->mv_limits.col_min - fcenter_mv.col); + end_row = VPXMIN(range, x->mv_limits.row_max - fcenter_mv.row); + end_col = VPXMIN(range, x->mv_limits.col_max - fcenter_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. + if (step > 1) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c }; + double sad = + fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), + in_what->stride); + if (sad < best_sad) { + sad += + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } else { + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + addrs[i] = get_buf_from_mv(in_what, &mv); + } + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); + + for (i = 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + const double sad = + sads[i] + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, + full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + double sad = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += lambda * + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } + } + } + } + + return best_sad; +} + +static double full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *centre_mv_full, + const vp9_variance_fn_ptr_t *fn_ptr, + MV *dst_mv, double lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + double bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + const MV dummy_mv = { 0, 0 }; + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) { + printf("ERROR: invalid range\n"); + assert(0); + } + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = VPXMIN(range, MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = + exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv, + lambda, nb_full_mvs, full_mv_num); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. + for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhaustive_mesh_search_new( + x, &temp_mv, sf->mesh_patterns[i].range, + sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs, + full_mv_num); + + if (sf->mesh_patterns[i].interval == 1) break; + } + } + + bestsme = vp9_get_mvpred_var(x, &temp_mv, &dummy_mv, fn_ptr, 0); + *dst_mv = temp_mv; + + return bestsme; +} + double vp9_diamond_search_sad_new(const MACROBLOCK *x, const search_site_config *cfg, const MV *init_full_mv, MV *best_full_mv, @@ -2279,11 +2423,14 @@ double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, double thissme; double bestsme; const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param; + const MV center_mv = { 0, 0 }; vpx_clear_system_state(); bestsme = vp9_diamond_search_sad_new( x, &cpi->ss_cfg, mvp_full, best_mv, best_mv_dist, best_mv_cost, step_param, lambda, &n, fn_ptr, nb_full_mvs, full_mv_num); + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + // If there won't be more n-step search, check to see if refining search is // needed. if (n > further_steps) do_refine = 0; @@ -2299,6 +2446,7 @@ double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, thissme = vp9_diamond_search_sad_new( x, &cpi->ss_cfg, mvp_full, &temp_mv, &mv_dist, &mv_cost, step_param + n, lambda, &num00, fn_ptr, nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); // check to see if refining search is needed. if (num00 > further_steps - n) do_refine = 0; @@ -2320,6 +2468,7 @@ double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, thissme = vp9_refining_search_sad_new(x, &temp_mv, &mv_dist, &mv_cost, lambda, search_range, fn_ptr, nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); if (thissme < bestsme) { bestsme = thissme; *best_mv = temp_mv; @@ -2327,6 +2476,9 @@ double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, *best_mv_cost = mv_cost; } } + + bestsme = full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda, + nb_full_mvs, full_mv_num); return bestsme; } #endif // CONFIG_NON_GREEDY_MV @@ -2335,7 +2487,8 @@ double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ -static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, +static int full_pixel_diamond(const VP9_COMP *const cpi, + const MACROBLOCK *const x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2395,13 +2548,11 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, return bestsme; } -#define MIN_RANGE 7 -#define MAX_RANGE 256 -#define MIN_INTERVAL 1 // Runs an limited range exhaustive mesh search using a pattern set // according to the encode speed profile. -static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, - MV *centre_mv_full, int sadpb, int *cost_list, +static int full_pixel_exhaustive(const VP9_COMP *const cpi, + const MACROBLOCK *const x, MV *centre_mv_full, + int sadpb, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { const SPEED_FEATURES *const sf = &cpi->sf; @@ -2427,7 +2578,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, interval = VPXMAX(interval, range / baseline_interval_divisor); // initial search - bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, + bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, sadpb, fn_ptr, &temp_mv); if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { @@ -2435,7 +2586,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, // till we reach a step size of 1. Then break out. for (i = 1; i < MAX_MESH_STEP; ++i) { // First pass with coarser step and longer range - bestsme = exhuastive_mesh_search( + bestsme = exhaustive_mesh_search( x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range, sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv); @@ -2663,13 +2814,13 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int search_method, - int error_per_bit, int *cost_list, const MV *ref_mv, - MV *tmp_mv, int var_max, int rd) { +int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int search_method, int error_per_bit, int *cost_list, + const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; const SEARCH_METHODS method = (SEARCH_METHODS)search_method; - vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; int run_exhaustive_search = 0; @@ -2714,10 +2865,10 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (method == NSTEP) { if (sf->exhaustive_searches_thresh < INT_MAX && !cpi->rc.is_src_frame_alt_ref) { - const int64_t exhuastive_thr = + const int64_t exhaustive_thr = sf->exhaustive_searches_thresh >> (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); - if (var > exhuastive_thr) run_exhaustive_search = 1; + if (var > exhaustive_thr) run_exhaustive_search = 1; } } else if (method == MESH) { run_exhaustive_search = 1; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index ab69afdcd..da93c5d44 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -38,6 +38,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); void vp9_init3smotion_compensation(search_site_config *cfg, int stride); @@ -110,7 +115,8 @@ struct VP9_COMP; // "mvp_full" is the MV search starting point; // "ref_mv" is the context reference MV; // "tmp_mv" is the searched best MV. -int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, +int vp9_full_pixel_search(const struct VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int search_method, int error_per_bit, int *cost_list, const MV *ref_mv, MV *tmp_mv, int var_max, int rd); @@ -143,11 +149,34 @@ static INLINE MV get_full_mv(const MV *mv) { out_mv.col = mv->col >> 3; return out_mv; } - struct TplDepFrame; void vp9_prepare_nb_full_mvs(const struct TplDepFrame *tpl_frame, int mi_row, int mi_col, int rf_idx, BLOCK_SIZE bsize, int_mv *nb_full_mvs); + +static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) { + BLOCK_SIZE square_bsize; + switch (bsize) { + case BLOCK_4X4: + case BLOCK_4X8: + case BLOCK_8X4: square_bsize = BLOCK_4X4; break; + case BLOCK_8X8: + case BLOCK_8X16: + case BLOCK_16X8: square_bsize = BLOCK_8X8; break; + case BLOCK_16X16: + case BLOCK_16X32: + case BLOCK_32X16: square_bsize = BLOCK_16X16; break; + case BLOCK_32X32: + case BLOCK_32X64: + case BLOCK_64X32: + case BLOCK_64X64: square_bsize = BLOCK_32X32; break; + default: + square_bsize = BLOCK_INVALID; + assert(0 && "ERROR: invalid block size"); + break; + } + return square_bsize; +} #endif // CONFIG_NON_GREEDY_MV #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index a3240513f..8cd1e6e31 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1683,6 +1683,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX; int gf_temporal_ref = 0; + int force_test_gf_zeromv = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1939,6 +1940,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, flag_svc_subpel = 1; } + // For SVC with quality layers, when QP of lower layer is lower + // than current layer: force check of GF-ZEROMV before early exit + // due to skip flag. + if (svc->spatial_layer_id > 0 && no_scaling && + (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + cm->base_qindex > svc->lower_layer_qindex + 10) + force_test_gf_zeromv = 1; + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; @@ -2349,11 +2358,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (reuse_inter_pred) free_pred_buffer(this_mode_pred); } - if (x->skip) break; + if (x->skip && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) + break; // If early termination flag is 1 and at least 2 modes are checked, // the mode search is terminated. - if (best_early_term && idx > 0 && !scene_change_detected) { + if (best_early_term && idx > 0 && !scene_change_detected && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) { x->skip = 1; break; } @@ -2396,6 +2408,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0 && + !x->zero_temp_sad_source) || (scene_change_detected && perform_intra_pred) || ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || x->content_state_sb == kVeryHighSad) && @@ -2438,8 +2452,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, const PREDICTION_MODE this_mode = intra_mode_list[i]; THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; int mode_rd_thresh = rd_threshes[mode_index]; + // For spatially flat blocks, under short_circuit_flat_blocks flag: + // only check DC mode for stationary blocks, otherwise also check + // H and V mode. if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != DC_PRED) { + ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) { continue; } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 5ad68e2e5..9df2eb333 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -247,6 +247,9 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { return target; } +// TODO(marpan/jianj): bits_off_target and buffer_level are used in the saame +// way for CBR mode, for the buffering updates below. Look into removing one +// of these (i.e., bits_off_target). // Update the buffer level before encoding with the per-frame-bandwidth, static void update_buffer_level_preencode(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; @@ -1800,6 +1803,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } + if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi); + // Keep record of last boosted (KF/KF/ARF) Q value. // If the current frame is coded at a lower Q then we also update it. // If all mbs in this group are skipped only update if the Q value is @@ -1922,8 +1927,10 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { // increasing buffer levels/overflow for certain layers even though whole // superframe is dropped, we cap buffer level if its already stable. if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP && - cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) + cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) { cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; + cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; + } } static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index debe88f9d..c73b0ed87 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2316,59 +2316,20 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, } #if CONFIG_NON_GREEDY_MV -#define MAX_PREV_NB_FULL_MV_NUM 8 -static int find_prev_nb_full_mvs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - int ref_frame, BLOCK_SIZE bsize, int mi_row, - int mi_col, int_mv *nb_full_mvs) { - int i; - const TileInfo *tile = &xd->tile; - int full_mv_num = 0; - assert(bsize >= BLOCK_8X8); - for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - const POSITION *mv_ref = &mv_ref_blocks[bsize][i]; - if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MODE_INFO *nb_mi = - xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; - if (nb_mi->sb_type >= BLOCK_8X8) { - if (nb_mi->ref_frame[0] == ref_frame) { - nb_full_mvs[full_mv_num].as_mv = get_full_mv(&nb_mi->mv[0].as_mv); - ++full_mv_num; - if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { - return full_mv_num; - } - } else if (nb_mi->ref_frame[1] == ref_frame) { - nb_full_mvs[full_mv_num].as_mv = get_full_mv(&nb_mi->mv[1].as_mv); - ++full_mv_num; - if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { - return full_mv_num; - } - } - } else { - int j; - for (j = 0; j < 4; ++j) { - // TODO(angiebird): avoid using duplicated mvs - if (nb_mi->ref_frame[0] == ref_frame) { - nb_full_mvs[full_mv_num].as_mv = - get_full_mv(&nb_mi->bmi[j].as_mv[0].as_mv); - ++full_mv_num; - if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { - return full_mv_num; - } - } else if (nb_mi->ref_frame[1] == ref_frame) { - nb_full_mvs[full_mv_num].as_mv = - get_full_mv(&nb_mi->bmi[j].as_mv[1].as_mv); - ++full_mv_num; - if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { - return full_mv_num; - } - } - } - } - } +static int ref_frame_to_gf_rf_idx(int ref_frame) { + if (ref_frame == GOLDEN_FRAME) { + return 0; + } + if (ref_frame == LAST_FRAME) { + return 1; } - return full_mv_num; + if (ref_frame == ALTREF_FRAME) { + return 2; + } + assert(0); + return -1; } -#endif // CONFIG_NON_GREEDY_MV +#endif static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, @@ -2395,10 +2356,13 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, double mv_cost = 0; double lambda = (pw * ph) / 4; double bestsme; - int_mv nb_full_mvs[MAX_PREV_NB_FULL_MV_NUM]; - - const int nb_full_mv_num = - find_prev_nb_full_mvs(cm, xd, ref, bsize, mi_row, mi_col, nb_full_mvs); + int_mv nb_full_mvs[NB_MVS_NUM]; + const int nb_full_mv_num = NB_MVS_NUM; + int gf_group_idx = cpi->twopass.gf_group.index; + int gf_rf_idx = ref_frame_to_gf_rf_idx(ref); + BLOCK_SIZE square_bsize = get_square_block_size(bsize); + vp9_prepare_nb_full_mvs(&cpi->tpl_stats[gf_group_idx], mi_row, mi_col, + gf_rf_idx, square_bsize, nb_full_mvs); #else // CONFIG_NON_GREEDY_MV int bestsme = INT_MAX; int sadpb = x->sadperbit16; @@ -3070,7 +3034,7 @@ static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, if (content_type == VP9E_CONTENT_FILM) { if (src_rec_min <= VERY_LOW_VAR_THRESH) { if (ref_frame == INTRA_FRAME) *this_rd *= 2; - if (bsize > 6) *this_rd *= 2; + if (bsize > BLOCK_16X16) *this_rd *= 2; } } } diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 5aede927b..50e25c35c 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -237,6 +237,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { + sf->temporal_filter_search_method = NSTEP; sf->ml_var_partition_pruning = !boosted; sf->ml_prune_rect_partition_threhold[1] = 200; sf->ml_prune_rect_partition_threhold[2] = 200; @@ -906,6 +907,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->allow_acl = 1; sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; + sf->temporal_filter_search_method = MESH; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 9b09ec474..98f8de81d 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -595,6 +595,9 @@ typedef struct SPEED_FEATURES { // Allow sub-pixel search to use interpolation filters with different taps in // order to achieve accurate motion search result. SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Search method used by temporal filtering in full_pixel_motion_search. + SEARCH_METHODS temporal_filter_search_method; } SPEED_FEATURES; struct VP9_COMP; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 3223f714b..35155c71f 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -1227,3 +1227,25 @@ void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) { cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id]; vp9_new_framerate(cpi, 10000000.0 / this_duration); } + +void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + RATE_CONTROL *const rc = &cpi->rc; + // On key frames in CBR mode: reset the avg_frame_index for base layer + // (to level closer to worst_quality) if the overshoot is significant. + // Reset it for all temporal layers on base spatial layer. + if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + rc->projected_frame_size > 3 * rc->avg_frame_bandwidth) { + int tl; + rc->avg_frame_qindex[INTER_FRAME] = + VPXMAX(rc->avg_frame_qindex[INTER_FRAME], + (cm->base_qindex + rc->worst_quality) >> 1); + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME]; + } + } +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index c25644617..34795d841 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -262,6 +262,7 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); +void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index cd340c394..d02603615 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -34,6 +34,9 @@ #include "vpx_scale/vpx_scale.h" static int fixed_divide[512]; +static unsigned int index_mult[14] = { + 0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124 +}; static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, @@ -184,7 +187,28 @@ void vp9_temporal_filter_init(void) { static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, int filter_weight) { - int mod = (sum_dist * 3) / index; + int mod; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + mod = + ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int highbd_mod_index(int sum_dist, int index, int rounding, + int strength, int filter_weight) { + int mod = sum_dist * 3 / index; mod += rounding; mod >>= strength; @@ -195,11 +219,12 @@ static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, return mod; } +#endif // CONFIG_VP9_HIGHBITDEPTH static INLINE int get_filter_weight(unsigned int i, unsigned int j, unsigned int block_height, - unsigned int block_width, int *blk_fw, - int use_32x32) { + unsigned int block_width, + const int *const blk_fw, int use_32x32) { int filter_weight = 0; if (use_32x32) @@ -220,12 +245,12 @@ static INLINE int get_filter_weight(unsigned int i, unsigned int j, return filter_weight; } -static void apply_temporal_filter( +void vp9_apply_temporal_filter_c( const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, int uv_buf_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int *blk_fw, int use_32x32, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { unsigned int i, j, k, m; @@ -361,133 +386,152 @@ static void apply_temporal_filter( } } -// TODO(any): This function is not used anymore. Should be removed. -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, - const uint8_t *frame2, - unsigned int block_width, - unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, - uint16_t *count) { - unsigned int i, j, k; - int modifier; - int byte = 0; - const int rounding = (1 << strength) >> 1; +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_apply_temporal_filter_c( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const int uv_block_width = block_width >> ss_x; + const int uv_block_height = block_height >> ss_y; + const int y_diff_stride = BW; + const int uv_diff_stride = BW; + + DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]); - assert(strength >= 0); - assert(strength <= 6); + const int rounding = (1 << strength) >> 1; - assert(filter_weight >= 0); - assert(filter_weight <= 2); + // Loop variables + int row, col; + int uv_row, uv_col; + int row_step, col_step; - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); - // non-local mean approach - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; + // Get the square diffs + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_diff_sse[row * y_diff_stride + col] = diff * diff; + } + } - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + for (row = 0; row < (int)uv_block_height; row++) { + for (col = 0; col < (int)uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff; + } + } - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; + // Apply the filter to luma + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = get_filter_weight( + row, col, block_height, block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col]; + y_num_used++; } } } - assert(index > 0); - - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; - - modifier *= 3; - modifier /= index; - - ++frame2; - - modifier += rounding; - modifier >>= strength; - - if (modifier > 16) modifier = 16; - - modifier = 16 - modifier; - modifier *= filter_weight; - - count[k] += modifier; - accumulator[k] += modifier * pixel_value; - - byte++; - } - - byte += stride - block_width; - } -} + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_temporal_filter_apply_c( - const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, - unsigned int block_width, unsigned int block_height, int strength, - int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count) { - const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); - unsigned int i, j, k; - int modifier; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + y_num_used += 2; - int diff_sse[BLK_PELS] = { 0 }; - int this_idx = 0; + // Set the modifier + y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength, + filter_weight); - for (i = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++) { - const int diff = - frame1[i * (int)stride + j] - frame2[i * (int)block_width + j]; - diff_sse[this_idx++] = diff * diff; + // Accumulate the result + y_count[row * block_width + col] += y_mod; + y_accum[row * block_width + col] += y_mod * y_pixel; } } - modifier = 0; - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = frame2[i * (int)block_width + j]; - int filter_weight = - get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); - - int idx, idy, index = 0; - - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; - - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - modifier += diff_sse[row * (int)block_width + col]; - ++index; + // Apply the filter to chroma + for (uv_row = 0; uv_row < (int)uv_block_height; uv_row++) { + for (uv_col = 0; uv_col < (int)uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = get_filter_weight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; } } } - assert(index > 0); - - modifier *= 3; - modifier /= index; - - modifier += rounding; - modifier >>= strength; - if (modifier > 16) modifier = 16; + // Sum all the luma pixels associated with the current luma pixel + for (row_step = 0; row_step < 1 + ss_y; row_step++) { + for (col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col]; - modifier = 16 - modifier; - modifier *= filter_weight; + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } - count[k] += modifier; - accumulator[k] += modifier * pixel_value; + // Set the modifier + u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * uv_block_width + uv_col] += u_mod; + u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel; + v_count[uv_row * uv_block_width + uv_col] += v_mod; + v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel; } } } @@ -501,6 +545,7 @@ static uint32_t temporal_filter_find_matching_mb_c( MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; const SEARCH_METHODS search_method = MESH; + const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method; int step_param; int sadpb = x->sadperbit16; uint32_t bestsme = UINT_MAX; @@ -563,7 +608,7 @@ static uint32_t temporal_filter_find_matching_mb_c( vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, - step_param, search_method, sadpb, + step_param, search_method_16, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, &blk_mvs[k], 0, 0); /* restore UMV window */ @@ -671,7 +716,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); #endif // CONFIG_VP9_HIGHBITDEPTH - if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + if (src_variance <= 2) { + strength = VPXMAX(0, arnr_filter_data->strength - 2); + } } for (frame = 0; frame < frame_count; frame++) { @@ -740,7 +787,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, } } - if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) { + if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) { // Construct the predictors temporal_filter_predictors_mb_c( mbd, frames[frame]->y_buffer + mb_y_offset, @@ -753,21 +800,20 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply( - f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, BH, - adj_strength, blk_fw, use_32x32, accumulator, count); - vp9_highbd_temporal_filter_apply( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + BLK_PELS, - mb_uv_width, mb_uv_height, adj_strength, blk_fw, use_32x32, - accumulator + BLK_PELS, count + BLK_PELS); - vp9_highbd_temporal_filter_apply( - f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_height, - adj_strength, blk_fw, use_32x32, accumulator + (BLK_PELS << 1), - count + (BLK_PELS << 1)); + vp9_highbd_apply_temporal_filter( + CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride, + CONVERT_TO_SHORTPTR(predictor), BW, + CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset), + CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride, + CONVERT_TO_SHORTPTR(predictor + BLK_PELS), + CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW, + BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y, + adj_strength, blk_fw, use_32x32, accumulator, count, + accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); } else { // Apply the filter (YUV) - apply_temporal_filter( + vp9_apply_temporal_filter( f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), @@ -778,7 +824,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, } #else // Apply the filter (YUV) - apply_temporal_filter( + vp9_apply_temporal_filter( f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), diff --git a/vp9/encoder/vp9_temporal_filter.h b/vp9/encoder/vp9_temporal_filter.h index f5fa194d1..553a46828 100644 --- a/vp9/encoder/vp9_temporal_filter.h +++ b/vp9/encoder/vp9_temporal_filter.h @@ -24,7 +24,7 @@ static const MV kZeroMv = { 0, 0 }; #define BH_LOG2 5 #define BW 32 #define BW_LOG2 5 -#define BLK_PELS 1024 // Pixels in the block +#define BLK_PELS ((BH) * (BW)) // Pixels in the block #define TF_SHIFT 2 #define TF_ROUND 3 #define THR_SHIFT 2 diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/x86/temporal_filter_constants.h new file mode 100644 index 000000000..20b7085a3 --- /dev/null +++ b/vp9/encoder/x86/temporal_filter_constants.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32758 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_5 (int16_t)39322 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_7 (int16_t)28087 +#define NEIGHBOR_CONSTANT_8 (int16_t)24576 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +#define NEIGHBOR_CONSTANT_10 (int16_t)19661 +#define NEIGHBOR_CONSTANT_11 (int16_t)17874 +#define NEIGHBOR_CONSTANT_13 (int16_t)15124 + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#define DIST_STRIDE ((BW) + 2) diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index e5860d39c..9f9483a9b 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -14,96 +14,58 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/x86/temporal_filter_constants.h" -// Division using multiplication and shifting. The C implementation does: -// modifier *= 3; -// modifier /= index; -// where 'modifier' is a set of summed values and 'index' is the number of -// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values -// which may be bound by the edges of the block being filtered. -// -// This equation works out to (m * 3) / i which reduces to: -// m * 3/4 -// m * 1/2 -// m * 1/3 -// -// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): -// m * C / 65536 -// we can create a C to replicate the division. -// -// m * 49152 / 65536 = m * 3/4 -// m * 32758 / 65536 = m * 1/2 -// m * 21846 / 65536 = m * 0.3333 -// -// These are loaded using an instruction expecting int16_t values but are used -// with _mm_mulhi_epu16(), which treats them as unsigned. -#define NEIGHBOR_CONSTANT_4 (int16_t)49152 -#define NEIGHBOR_CONSTANT_6 (int16_t)32768 -#define NEIGHBOR_CONSTANT_9 (int16_t)21846 - -// Load values from 'a' and 'b'. Compute the difference squared and sum -// neighboring values such that: -// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 -// Values to the left and right of the row are set to 0. -// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. -static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { - const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); - const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); - - const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); - - const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); - const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); - - // Shift all the values one place to the left/right so we can efficiently sum - // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. - const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); - const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); - - // It becomes necessary to treat the values as unsigned at this point. The - // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point - // forward since the filter is only applied to smooth small pixel changes. - // Once the value has saturated to uint16_t it is well outside the useful - // range. - __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum = sum_u16; -} +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); -static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, - __m128i *sum_1) { - const __m128i zero = _mm_setzero_si128(); - const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); - const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; - const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); - const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); - const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} - const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); - const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); - const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); - const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); - __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); - // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. - __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); - __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + __m128i dist_first, dist_second; - *sum_0 = sum_u16; + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); - shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); - shift_right = _mm_srli_si128(diff_sq_1_u16, 2); + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} - sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} - *sum_1 = sum_u16; +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); } // Average the value based on the number of values summed (9 for pixels away @@ -111,7 +73,7 @@ static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, // // Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply // by weight. -static __m128i average_8(__m128i sum, const __m128i mul_constants, +static __m128i average_8(__m128i sum, const __m128i *mul_constants, const int strength, const int rounding, const int weight) { // _mm_srl_epi16 uses the lower 64 bit value for the shift. @@ -121,7 +83,34 @@ static __m128i average_8(__m128i sum, const __m128i mul_constants, const __m128i sixteen = _mm_set1_epi16(16); // modifier * 3 / index; - sum = _mm_mulhi_epu16(sum, mul_constants); + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static __m128i average_4_4(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight_0, const int weight_1) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = + _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1, + weight_1, weight_1); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); sum = _mm_adds_epu16(sum, rounding_u16); sum = _mm_srl_epi16(sum, strength_u128); @@ -136,20 +125,21 @@ static __m128i average_8(__m128i sum, const __m128i mul_constants, return _mm_mullo_epi16(sum, weight_u16); } -static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, - const __m128i mul_constants_0, - const __m128i mul_constants_1, const int strength, - const int rounding, const int weight) { +static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); const __m128i rounding_u16 = _mm_set1_epi16(rounding); const __m128i weight_u16 = _mm_set1_epi16(weight); const __m128i sixteen = _mm_set1_epi16(16); __m128i input_0, input_1; - input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); + input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0); input_0 = _mm_adds_epu16(input_0, rounding_u16); - input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); + input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1); input_1 = _mm_adds_epu16(input_1, rounding_u16); input_0 = _mm_srl_epi16(input_0, strength_u128); @@ -192,10 +182,10 @@ static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); } -static void accumulate_and_store_16(const __m128i sum_0_u16, - const __m128i sum_1_u16, - const uint8_t *pred, uint16_t *count, - uint32_t *accumulator) { +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); const __m128i zero = _mm_setzero_si128(); __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), @@ -235,142 +225,773 @@ static void accumulate_and_store_16(const __m128i sum_0_u16, _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); } -void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, - const uint8_t *b, unsigned int width, - unsigned int height, int strength, - int weight, uint32_t *accumulator, - uint16_t *count) { - unsigned int h; +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizaontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizonta add unsigned 16-bit ints in src and store them as signed 32-bit int +// in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_luma_16( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, + const uint16_t *v_dist, const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; assert(strength >= 0); assert(strength <= 6); - assert(weight >= 0); - assert(weight <= 2); - - assert(width == 8 || width == 16); - - if (width == 8) { - __m128i sum_row_a, sum_row_b, sum_row_c; - __m128i mul_constants = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_8(a, b, &sum_row_a); - sum_8(a + stride, b + width, &sum_row_b); - sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_c, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - - for (h = 0; h < height - 2; ++h) { - sum_8(a, b + width, &sum_row_c); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); - sum_row_a = - average_8(sum_row_a, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a = sum_row_b; - sum_row_b = sum_row_c; - } + assert(block_width == 16); + + (void)block_width; + + // First row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - } else { // width == 16 - __m128i sum_row_a_0, sum_row_a_1; - __m128i sum_row_b_0, sum_row_b_1; - __m128i sum_row_c_0, sum_row_c_1; - __m128i mul_constants_0 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), - mul_constants_1 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_16(a, b, &sum_row_a_0, &sum_row_a_1); - sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); - - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - for (h = 0; h < height - 2; ++h) { - sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); - - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); - - average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]); + + for (unsigned int h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, strength, rounding, weight); - accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } - a += stride; - b += width; - count += width; - accumulator += width; + // The last row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } - sum_row_a_0 = sum_row_b_0; - sum_row_a_1 = sum_row_b_1; - sum_row_b_0 = sum_row_c_0; - sum_row_b_1 = sum_row_c_1; + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void vp9_apply_temporal_filter_luma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist, + const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The blockwidth is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usualy left-midle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw); } - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_chroma_8( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + (void)uv_block_width; + + // First row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); } + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = _mm_loadu_si128((const __m128i *)neighbors[1]); + + for (unsigned int h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_apply_temporal_filter_chroma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference sqaured + for (unsigned int row = 0; row < block_height; row++) { + for (unsigned int blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (unsigned int row = 0; row < chroma_height; row++) { + for (unsigned int blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_apply_temporal_filter_luma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + vp9_apply_temporal_filter_chroma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); } diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 7ca4004b0..c9a55669e 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -64,9 +64,12 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 05981d689..67e5389a7 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -103,6 +103,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c @@ -137,10 +138,13 @@ VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +endif # !CONFIG_VP9_HIGHBITDEPTH VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c @@ -149,5 +153,6 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 59f612b94..93a5f368b 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -28,5 +28,7 @@ VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h +VP9_DX_SRCS-yes += decoder/vp9_job_queue.c +VP9_DX_SRCS-yes += decoder/vp9_job_queue.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) |