diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_blockd.c | 2 | ||||
-rw-r--r-- | vp9/common/vp9_mvref_common.c | 4 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 1 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodeframe.c | 8 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodemv.c | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_bitstream.c | 9 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 86 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 60 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_svc_layercontext.c | 63 | ||||
-rw-r--r-- | vp9/encoder/vp9_svc_layercontext.h | 10 | ||||
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 3 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_quantize_sse2.c | 225 | ||||
-rw-r--r-- | vp9/vp9_cx_iface.c | 6 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 1 |
19 files changed, 408 insertions, 95 deletions
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index e13445fd1..7094a0118 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -92,7 +92,7 @@ void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, void *arg) { int plane; - for (plane = 0; plane < MAX_MB_PLANE; plane++) + for (plane = 0; plane < MAX_MB_PLANE; ++plane) vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index fbb3d4b5a..3b34050a8 100644 --- a/vp9/common/vp9_mvref_common.c +++ b/vp9/common/vp9_mvref_common.c @@ -24,10 +24,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, ? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi : NULL; const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL; - - const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; - int different_ref_found = 0; int context_counter = 0; @@ -127,7 +124,6 @@ static void lower_mv_precision(MV *mv, int allow_hp) { } } - void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, int_mv *nearest, int_mv *near) { int i; diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index c28f156ab..f1eda9117 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -143,7 +143,6 @@ typedef struct VP9Common { int prev_mi_idx; int mi_alloc_size; MODE_INFO *mip_array[2]; - MODE_INFO **mi_grid_base_array[2]; MODE_INFO *mip; /* Base of allocated array */ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c2a918106..de389e7af 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1155,7 +1155,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int 
skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_b/, "$ssse3_x86_64"; + specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 2690f4887..dc712f045 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -656,10 +656,8 @@ static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, int *width, int *height) { - const int w = vp9_rb_read_literal(rb, 16) + 1; - const int h = vp9_rb_read_literal(rb, 16) + 1; - *width = w; - *height = h; + *width = vp9_rb_read_literal(rb, 16) + 1; + *height = vp9_rb_read_literal(rb, 16) + 1; } static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { @@ -749,7 +747,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, if (!found) vp9_read_frame_size(rb, &width, &height); - if (width <=0 || height <= 0) + if (width <= 0 || height <= 0) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Invalid frame size"); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 3c8e7cc6b..a01fe842e 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -223,7 +223,6 @@ static int read_mv_component(vp9_reader *r, fr = vp9_read_tree(r, 
vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d] : mvcomp->fp); - // High precision part (if hp is not used, the default value of the hp is 1) hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp) : 1; diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 694cac76f..3954fe6a7 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1013,7 +1013,11 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || (cpi->svc.number_spatial_layers > 1 && - cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) { + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) || + (is_two_pass_svc(cpi) && + cpi->svc.encode_empty_frame_state == ENCODING && + cpi->svc.layer_context[0].frames_from_key_frame < + cpi->svc.number_temporal_layers + 1))) { found = 0; } vp9_wb_write_bit(wb, found); @@ -1105,8 +1109,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, // will change to show_frame flag to 0, then add an one byte frame with // show_existing_frame flag which tells the decoder which frame we want to // show. - if (!cm->show_frame || - (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)) + if (!cm->show_frame) vp9_wb_write_bit(wb, cm->intra_only); if (!cm->error_resilient_mode) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index b43ff9747..197f54cfc 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -412,29 +412,47 @@ static int set_vt_partitioning(VP9_COMP *cpi, return 1; } - // Vertical split is available on all but the bottom border. 
- if (mi_row + block_height / 2 < cm->mi_rows && - vt.part_variances->vert[0].variance < threshold && - vt.part_variances->vert[1].variance < threshold) { - BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); - set_block_size(cpi, mi_row, mi_col, subsize); - set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); - return 1; - } - - // Horizontal split is available on all but the right border. - if (mi_col + block_width / 2 < cm->mi_cols && - vt.part_variances->horz[0].variance < threshold && - vt.part_variances->horz[1].variance < threshold) { - BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); - set_block_size(cpi, mi_row, mi_col, subsize); - set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); - return 1; + // Only allow split for blocks above 16x16. + if (bsize > BLOCK_16X16) { + // Vertical split is available on all but the bottom border. + if (mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + + // Horizontal split is available on all but the right border. + if (mi_col + block_width / 2 < cm->mi_cols && + vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + + // This will only allow 8x8 if the 16x16 variance is very large. 
+ if (bsize == BLOCK_16X16) { + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->none.variance < (threshold << 6)) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; + } } return 0; } -// TODO(debargha): Fix this function and make it work as expected. +// This function chooses partitioning based on the variance +// between source and reconstructed last, where variance is +// computed for 8x8 downsampled inputs. Some things to check: +// using the last source rather than reconstructed last, and +// allowing for small downsampling (4x4 or 2x2) for selection +// of smaller block sizes (i.e., < 16x16). static void choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, int mi_col) { @@ -549,27 +567,11 @@ static void choose_partitioning(VP9_COMP *cpi, for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); - // NOTE: This is a temporary hack to disable 8x8 partitions, - // since it works really bad - possibly due to a bug -#define DISABLE_8X8_VAR_BASED_PARTITION -#ifdef DISABLE_8X8_VAR_BASED_PARTITION - if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows && - mi_row + x32_idx + x16_idx + 1 < cm->mi_cols) { - set_block_size(cpi, - (mi_row + y32_idx + y16_idx), - (mi_col + x32_idx + x16_idx), - BLOCK_16X16); - } else { - for (k = 0; k < 4; ++k) { - const int x8_idx = (k & 1); - const int y8_idx = (k >> 1); - set_block_size(cpi, - (mi_row + y32_idx + y16_idx + y8_idx), - (mi_col + x32_idx + x16_idx + x8_idx), - BLOCK_8X8); - } - } -#else + // NOTE: Since this uses 8x8 downsampling for variance calculation + // we cannot really select block size 8x8 (or even 8x16/16x8), + // since we do not have sufficient samples for variance. + // For now, 8x8 partition is only set if the variance of the 16x16 + // block is very high. This is controlled in set_vt_partitioning. 
if (!set_vt_partitioning(cpi, &vt.split[i].split[j], BLOCK_16X16, mi_row + y32_idx + y16_idx, @@ -583,7 +585,6 @@ static void choose_partitioning(VP9_COMP *cpi, BLOCK_8X8); } } -#endif } } } @@ -3158,7 +3159,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int64_t dummy_dist = 0; const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO *mi = cm->mi + idx_str; - MODE_INFO *prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi; BLOCK_SIZE bsize; x->in_static_area = 0; x->source_variance = UINT_MAX; @@ -3196,7 +3196,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root); } else { - copy_partitioning(cm, mi, prev_mi); + choose_partitioning(cpi, tile, mi_row, mi_col); nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rate, &dummy_dist, cpi->pc_root); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 8096a9072..1758e3fdb 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -225,6 +225,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { } vpx_memset(&cpi->svc.scaled_frames[0], 0, MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0])); + + vp9_free_frame_buffer(&cpi->svc.empty_frame.img); + vpx_memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame)); } static void save_coding_context(VP9_COMP *cpi) { @@ -585,8 +588,6 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cpi->ref_frame_flags = 0; init_buffer_indices(cpi); - - set_tile_limits(cpi); } static void set_rc_buffer_sizes(RATE_CONTROL *rc, @@ -2981,7 +2982,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } } if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { + // Use the last frame context for the empty frame. cm->frame_context_idx = + (cpi->svc.encode_empty_frame_state == ENCODING) ? 
FRAME_CONTEXTS - 1 : cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id; @@ -3162,7 +3165,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->ref_frame_flags = get_ref_frame_flags(cpi); cm->last_frame_type = cm->frame_type; - vp9_rc_postencode_update(cpi, *size); + + if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) + vp9_rc_postencode_update(cpi, *size); #if 0 output_frame_level_debug_stats(cpi); @@ -3186,12 +3191,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->last_height = cm->height; // reset to normal state now that we are done. - if (!cm->show_existing_frame) { - if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) - cm->last_show_frame = 0; - else - cm->last_show_frame = cm->show_frame; - } + if (!cm->show_existing_frame) + cm->last_show_frame = cm->show_frame; if (cm->show_frame) { vp9_swap_mi_and_prev_mi(cm); @@ -3228,7 +3229,9 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; encode_frame_to_data_rate(cpi, size, dest, frame_flags); - vp9_twopass_postencode_update(cpi); + + if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) + vp9_twopass_postencode_update(cpi); } static void init_motion_estimation(VP9_COMP *cpi) { @@ -3416,6 +3419,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (is_two_pass_svc(cpi)) { #if CONFIG_SPATIAL_SVC vp9_svc_start_frame(cpi); + // Use a small empty frame instead of a real frame + if (cpi->svc.encode_empty_frame_state == ENCODING) + source = &cpi->svc.empty_frame; #endif if (oxcf->pass == 2) vp9_restore_layer_context(cpi); @@ -3434,6 +3440,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Should we encode an arf frame. 
arf_src_index = get_arf_src_index(cpi); + + // Skip alt frame if we encode the empty frame + if (is_two_pass_svc(cpi) && source != NULL) + arf_src_index = 0; + if (arf_src_index) { assert(arf_src_index <= rc->frames_to_key); @@ -3544,7 +3555,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // For two pass encodes analyse the first pass stats and determine // the bit allocation and other parameters for this frame / group of frames. - if ((oxcf->pass == 2) && (!cpi->use_svc || is_two_pass_svc(cpi))) { + if ((oxcf->pass == 2) && + (!cpi->use_svc || + (is_two_pass_svc(cpi) && + cpi->svc.encode_empty_frame_state != ENCODING))) { vp9_rc_get_second_pass_params(cpi); } @@ -3773,10 +3787,18 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_two_pass_svc(cpi) && cm->show_frame) { - ++cpi->svc.spatial_layer_to_encode; - if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) - cpi->svc.spatial_layer_to_encode = 0; + if (is_two_pass_svc(cpi)) { + if (cpi->svc.encode_empty_frame_state == ENCODING) + cpi->svc.encode_empty_frame_state = ENCODED; + + if (cm->show_frame) { + ++cpi->svc.spatial_layer_to_encode; + if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) + cpi->svc.spatial_layer_to_encode = 0; + + // May need the empty frame after a visible frame. 
+ cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE; + } } return 0; } @@ -3867,10 +3889,6 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, if (width) { cm->width = width; - if (cm->width * 5 < cpi->initial_width) { - cm->width = cpi->initial_width / 5 + 1; - printf("Warning: Desired width too small, changed to %d\n", cm->width); - } if (cm->width > cpi->initial_width) { cm->width = cpi->initial_width; printf("Warning: Desired width too large, changed to %d\n", cm->width); @@ -3879,10 +3897,6 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, if (height) { cm->height = height; - if (cm->height * 5 < cpi->initial_height) { - cm->height = cpi->initial_height / 5 + 1; - printf("Warning: Desired height too small, changed to %d\n", cm->height); - } if (cm->height > cpi->initial_height) { cm->height = cpi->initial_height; printf("Warning: Desired height too large, changed to %d\n", cm->height); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index c9588a343..96c3e0aa4 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2405,6 +2405,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); lc->frames_from_key_frame = 0; + // Reset the empty frame resolution since we have a key frame. 
+ cpi->svc.empty_frame_width = cm->width; + cpi->svc.empty_frame_height = cm->height; } } else { cm->frame_type = INTER_FRAME; @@ -2478,6 +2481,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { if (rc->total_actual_bits) { rc->rate_error_estimate = (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); + rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); } else { rc->rate_error_estimate = 0; } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 42f46917c..b74b2dd56 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -486,8 +486,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; + // Reduce the intra cost penalty for small blocks (<=16x16). + const int reduction_fac = + (cpi->sf.partition_search_type == VAR_BASED_PARTITION && + bsize <= BLOCK_16X16) ? 4 : 1; const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac; const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int intra_mode_cost = 50; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index ef32fe179..65bca669a 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1500,9 +1500,7 @@ void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi, rc->max_gf_interval = 16; // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = oxcf->key_freq >> 1; - if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2)) - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; if (is_altref_enabled(cpi)) { if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) diff --git a/vp9/encoder/vp9_speed_features.c 
b/vp9/encoder/vp9_speed_features.c index 8788be645..bec77d71f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -271,6 +271,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->partition_search_type = REFERENCE_PARTITION; sf->use_nonrd_pick_mode = 1; sf->allow_skip_recode = 0; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; } if (speed >= 6) { @@ -285,10 +289,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->partition_search_type = VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; sf->mv.search_method = NSTEP; - sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; - sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; - sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; - sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; sf->tx_size_search_method = is_keyframe ? 
USE_LARGESTALL : USE_TX_8X8; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index eed681c96..1573557d4 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -14,6 +14,8 @@ #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_extend.h" +#define SMALL_FRAME_FB_IDX 7 + void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -28,6 +30,25 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { layer_end = svc->number_temporal_layers; } else { layer_end = svc->number_spatial_layers; + + if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { + if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img, + cpi->common.width, cpi->common.height, + cpi->common.subsampling_x, + cpi->common.subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cpi->common.use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate empty frame for multiple frame " + "contexts"); + + vpx_memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80, + cpi->svc.empty_frame.img.buffer_alloc_sz); + cpi->svc.empty_frame_width = cpi->common.width; + cpi->svc.empty_frame_height = cpi->common.height; + } } for (layer = 0; layer < layer_end; ++layer) { @@ -310,6 +331,47 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + + // Workaround for multiple frame contexts. In some frames we can't use prev_mi + // since its previous frame could be changed during decoding time. The idea is + // we put an empty invisible frame in front of them, then we will not use + // prev_mi when encoding these frames. 
+ if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && + cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) { + if ((cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || + (cpi->svc.number_spatial_layers > 1 && + cpi->svc.spatial_layer_id == 0)) { + struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0); + + if (buf != NULL) { + cpi->svc.empty_frame.ts_start = buf->ts_start; + cpi->svc.empty_frame.ts_end = buf->ts_end; + cpi->svc.encode_empty_frame_state = ENCODING; + cpi->common.show_frame = 0; + cpi->ref_frame_flags = 0; + cpi->common.frame_type = INTER_FRAME; + cpi->lst_fb_idx = + cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX; + + // Gradually make the empty frame smaller to save bits. Make it half of + // its previous size because of the scaling factor restriction. + cpi->svc.empty_frame_width >>= 1; + cpi->svc.empty_frame_width = (cpi->svc.empty_frame_width + 1) & ~1; + if (cpi->svc.empty_frame_width < 16) + cpi->svc.empty_frame_width = 16; + + cpi->svc.empty_frame_height >>= 1; + cpi->svc.empty_frame_height = (cpi->svc.empty_frame_height + 1) & ~1; + if (cpi->svc.empty_frame_height < 16) + cpi->svc.empty_frame_height = 16; + + width = cpi->svc.empty_frame_width; + height = cpi->svc.empty_frame_height; + } + } + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -317,7 +379,6 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q); vp9_change_config(cpi, &cpi->oxcf); - vp9_set_high_precision_mv(cpi, 1); cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 47a5456b6..e9645ce9f 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -50,6 +50,16 @@ typedef struct { int spatial_layer_to_encode; + // Workaround for multiple frame contexts + enum 
{ + ENCODED = 0, + ENCODING, + NEED_TO_ENCODE + }encode_empty_frame_state; + struct lookahead_entry empty_frame; + int empty_frame_width; + int empty_frame_height; + // Store scaled source frames to be used for temporal filter to generate // a alt ref frame. YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS]; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 9ae81e761..5599227ce 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -719,6 +719,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { ++frame_used; } } + cm->mi = cm->mip + cm->mi_stride + 1; + cpi->mb.e_mbd.mi = cm->mi; + cpi->mb.e_mbd.mi[0].src_mi = &cpi->mb.e_mbd.mi[0]; } else { // ARF is produced at the native frame size and resized when coded. #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c new file mode 100644 index 000000000..7c1c8843c --- /dev/null +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> +#include <xmmintrin.h> + +#include "vpx/vpx_integer.h" + +void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t* zbin_ptr, + const int16_t* round_ptr, const int16_t* quant_ptr, + const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr, + int16_t* dqcoeff_ptr, const int16_t* dequant_ptr, + int zbin_oq_value, uint16_t* eob_ptr, + const int16_t* scan_ptr, + const int16_t* iscan_ptr) { + __m128i zero; + (void)scan_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + zero = _mm_setzero_si128(); + if (!skip_block) { + __m128i eob; + __m128i zbin; + __m128i round, quant, dequant, shift; + { + __m128i coeff0, coeff1; + + // Setup global values + { + __m128i zbin_oq; + __m128i pw_1; + zbin_oq = _mm_set1_epi16(zbin_oq_value); + zbin = _mm_load_si128((const __m128i*)zbin_ptr); + round = _mm_load_si128((const __m128i*)round_ptr); + quant = _mm_load_si128((const __m128i*)quant_ptr); + zbin = _mm_add_epi16(zbin, zbin_oq); + pw_1 = _mm_set1_epi16(1); + zbin = _mm_sub_epi16(zbin, pw_1); + dequant = _mm_load_si128((const __m128i*)dequant_ptr); + shift = _mm_load_si128((const __m128i*)quant_shift_ptr); + } + + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + __m128i cmp_mask0, cmp_mask1; + // Do DC and first 15 AC + coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); + coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = 
_mm_cmpgt_epi16(qcoeff1, zbin); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + shift = _mm_unpackhi_epi64(shift, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = 
_mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + __m128i cmp_mask0, cmp_mask1; + + coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs)); + coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0); + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0); + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + 
zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); + } + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } + } else { + do { + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero); + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero); + _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero); + n_coeffs += 8 * 2; + } while (n_coeffs < 0); + *eob_ptr = 0; + } +} diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 041ba27da..d0ca5242c 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -188,11 +188,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, } if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) ERROR("Not enough ref buffers for svc alt ref frames"); - if ((cfg->ss_number_layers > 3 || - cfg->ss_number_layers * cfg->ts_number_layers > 4) && + if (cfg->ss_number_layers * cfg->ts_number_layers > 3 && cfg->g_error_resilient == 0) - ERROR("Multiple frame context are not supported for more than 3 spatial " - "layers or more than 4 spatial x 
temporal layers"); + ERROR("Multiple frame context are not supported for more than 3 layers"); } #endif diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index a2e3cda7f..ad767229a 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -101,6 +101,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm |