diff options
-rw-r--r-- | examples/vp9_spatial_scalable_encoder.c | 34 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter.c | 77 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.h | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_aq_cyclicrefresh.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_int.h | 33 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_sad4d_intrin_avx2.c | 167 | ||||
-rw-r--r-- | vp9/vp9_dx_iface.c | 1 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 1 | ||||
-rw-r--r-- | vpx/src/svc_encodeframe.c | 23 |
10 files changed, 275 insertions, 71 deletions
diff --git a/examples/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_scalable_encoder.c index 5c80d34ea..64e62ef29 100644 --- a/examples/vp9_spatial_scalable_encoder.c +++ b/examples/vp9_spatial_scalable_encoder.c @@ -67,13 +67,22 @@ static const arg_def_t pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); static const arg_def_t fpf_name_arg = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); +static const arg_def_t min_q_arg = + ARG_DEF(NULL, "min-q", 1, "Minimum quantizer"); +static const arg_def_t max_q_arg = + ARG_DEF(NULL, "max-q", 1, "Maximum quantizer"); +static const arg_def_t min_bitrate_arg = + ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate"); +static const arg_def_t max_bitrate_arg = + ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate"); static const arg_def_t *svc_args[] = { &encoding_mode_arg, &frames_arg, &width_arg, &height_arg, &timebase_arg, &bitrate_arg, &skip_frames_arg, &layers_arg, &kf_dist_arg, &scale_factors_arg, &quantizers_arg, - &quantizers_keyframe_arg, &passes_arg, &pass_arg, - &fpf_name_arg, NULL + &quantizers_keyframe_arg, &passes_arg, &pass_arg, + &fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg, + &max_bitrate_arg, NULL }; static const SVC_ENCODING_MODE default_encoding_mode = @@ -120,6 +129,8 @@ static void parse_command_line(int argc, const char **argv_, int passes = 0; int pass = 0; const char *fpf_file_name = NULL; + unsigned int min_bitrate = 0; + unsigned int max_bitrate = 0; // initialize SvcContext with parameters that will be passed to vpx_svc_init svc_ctx->log_level = SVC_LOG_DEBUG; @@ -186,6 +197,14 @@ static void parse_command_line(int argc, const char **argv_, } } else if (arg_match(&arg, &fpf_name_arg, argi)) { fpf_file_name = arg.val; + } else if (arg_match(&arg, &min_q_arg, argi)) { + enc_cfg->rc_min_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_q_arg, argi)) { + enc_cfg->rc_max_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &min_bitrate_arg, argi)) { + min_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_bitrate_arg, argi)) { + max_bitrate = arg_parse_uint(&arg); } else { ++argj; } @@ -221,6 +240,17 @@ static void parse_command_line(int argc, const char **argv_, app_input->pass = pass; } + if (enc_cfg->rc_target_bitrate > 0) { + if (min_bitrate > 0) { + enc_cfg->rc_2pass_vbr_minsection_pct = + min_bitrate * 100 / enc_cfg->rc_target_bitrate; + } + if (max_bitrate > 0) { + enc_cfg->rc_2pass_vbr_maxsection_pct = + max_bitrate * 100 / enc_cfg->rc_target_bitrate; + } + } + // Check for unrecognized options for (argi = argv; *argi; ++argi) if (argi[0][0] == '-' && strlen(argi[0]) > 1) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index af8afed84..e48d4178b 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -228,6 +228,12 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { } } +static uint8_t get_filter_level(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi) { + return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; +} + void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; @@ -493,27 +499,25 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, const int shift_uv, LOOP_FILTER_MASK *lfm) { - const BLOCK_SIZE block_size = mi->mbmi.sb_type; - const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); - const int skip = mi->mbmi.skip; - const int seg = mi->mbmi.segment_id; - const int ref = mi->mbmi.ref_frame[0]; - const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; - uint64_t *left_y = &lfm->left_y[tx_size_y]; - uint64_t *above_y = &lfm->above_y[tx_size_y]; - uint64_t *int_4x4_y = &lfm->int_4x4_y; - uint16_t *left_uv = &lfm->left_uv[tx_size_uv]; - uint16_t *above_uv = &lfm->above_uv[tx_size_uv]; - uint16_t *int_4x4_uv = &lfm->int_4x4_uv; + const MB_MODE_INFO *mbmi = &mi->mbmi; + const BLOCK_SIZE block_size = mbmi->sb_type; + const TX_SIZE tx_size_y = mbmi->tx_size; + const TX_SIZE tx_size_uv = get_uv_tx_size(mbmi); + const int filter_level = get_filter_level(lfi_n, mbmi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; int i; - int w = num_8x8_blocks_wide_lookup[block_size]; - int h = num_8x8_blocks_high_lookup[block_size]; // If filter level is 0 we don't loop filter. if (!filter_level) { return; } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; int index = shift_y; for (i = 0; i < h; i++) { vpx_memset(&lfm->lfl_y[index], filter_level, w); @@ -540,7 +544,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // If the block has no coefficients and is not intra we skip applying // the loop filter on block edges. - if (skip && ref > INTRA_FRAME) + if (mbmi->skip && is_inter_block(mbmi)) return; // Here we are adding a mask for the transform size. The transform @@ -561,12 +565,11 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // boundaries. These differ from the 4x4 boundaries on the outside edge of // an 8x8 in that the internal ones can be skipped and don't depend on // the prediction block size. - if (tx_size_y == TX_4X4) { + if (tx_size_y == TX_4X4) *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; - } - if (tx_size_uv == TX_4X4) { + + if (tx_size_uv == TX_4X4) *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; - } } // This function does the same thing as the one above with the exception that @@ -575,22 +578,20 @@ static void build_masks(const loop_filter_info_n *const lfi_n, static void build_y_mask(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, LOOP_FILTER_MASK *lfm) { - const BLOCK_SIZE block_size = mi->mbmi.sb_type; - const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const int skip = mi->mbmi.skip; - const int seg = mi->mbmi.segment_id; - const int ref = mi->mbmi.ref_frame[0]; - const int filter_level = lfi_n->lvl[seg][ref][mode_lf_lut[mi->mbmi.mode]]; - uint64_t *left_y = &lfm->left_y[tx_size_y]; - uint64_t *above_y = &lfm->above_y[tx_size_y]; - uint64_t *int_4x4_y = &lfm->int_4x4_y; + const MB_MODE_INFO *mbmi = &mi->mbmi; + const BLOCK_SIZE block_size = mbmi->sb_type; + const TX_SIZE tx_size_y = mbmi->tx_size; + const int filter_level = get_filter_level(lfi_n, mbmi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; int i; - int w = num_8x8_blocks_wide_lookup[block_size]; - int h = num_8x8_blocks_high_lookup[block_size]; if (!filter_level) { return; } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; int index = shift_y; for (i = 0; i < h; i++) { vpx_memset(&lfm->lfl_y[index], filter_level, w); @@ -601,7 +602,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, *above_y |= above_prediction_mask[block_size] << shift_y; *left_y |= left_prediction_mask[block_size] << shift_y; - if (skip && ref > INTRA_FRAME) + if (mbmi->skip && is_inter_block(mbmi)) return; *above_y |= (size_mask[block_size] & @@ -610,9 +611,8 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y]) << shift_y; - if (tx_size_y == TX_4X4) { + if (tx_size_y == TX_4X4) *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; - } } // This function sets up the bit masks for the entire 64x64 region represented @@ -868,13 +868,6 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); } -static uint8_t build_lfi(const loop_filter_info_n *lfi_n, - const MB_MODE_INFO *mbmi) { - const int seg = mbmi->segment_id; - const int ref = mbmi->ref_frame[0]; - return lfi_n->lvl[seg][ref][mode_lf_lut[mbmi->mode]]; -} - static void filter_selectively_vert(uint8_t *s, int pitch, unsigned int mask_16x16, unsigned int mask_8x8, @@ -953,7 +946,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, // Filter level can vary per MI if (!(lfl[(r << 3) + (c >> ss_x)] = - build_lfi(&cm->lf_info, &mi[0].mbmi))) + get_filter_level(&cm->lf_info, &mi[0].mbmi))) continue; // Build masks based on the transform size of each block diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index e4cd9d4a0..b874ef3ba 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -636,7 +636,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vp9_sad4x4x8 sse4/; add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x4d sse2/; +specialize qw/vp9_sad64x64x4d sse2 avx2/; add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad32x64x4d sse2/; @@ -651,7 +651,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co specialize qw/vp9_sad16x32x4d sse2/; add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x4d sse2/; +specialize qw/vp9_sad32x32x4d sse2 avx2/; add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad16x16x4d sse2/; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index a9458c4d4..e99b21af6 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -31,10 +31,8 @@ typedef struct { int width; int height; int version; - int postprocess; int max_threads; int inv_tile_order; - int input_partition; } VP9D_CONFIG; typedef struct VP9Decompressor { diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index abf770327..5c80f8a8e 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -150,7 +150,7 @@ void vp9_setup_cyclic_refresh_aq(VP9_COMP *const cpi) { int qindex_delta = 0; int mbs_in_frame = cm->mi_rows * cm->mi_cols; int i, x, y, block_count, bl_index, bl_index2; - int sum_map, new_value, mi_row, mi_col, xmis, ymis, qindex2; + int sum_map, mi_row, mi_col, xmis, ymis, qindex2; // Rate target ratio to set q delta. float rate_ratio_qdelta = 2.0; @@ -249,9 +249,9 @@ void vp9_setup_cyclic_refresh_aq(VP9_COMP *const cpi) { bl_index2 = bl_index + y * cm->mi_cols + x; sum_map += seg_map[bl_index2]; } - new_value = 0; // If segment is partial over superblock, reset. if (sum_map > 0 && sum_map < xmis * ymis) { + int new_value; if (sum_map < xmis * ymis / 2) new_value = 0; else diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 4bfaacdff..a2147d0b4 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -874,8 +874,8 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc); int vp9_get_quantizer(struct VP9_COMP *cpi); -static int get_ref_frame_idx(const VP9_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { +static INLINE int get_ref_frame_idx(const VP9_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { if (ref_frame == LAST_FRAME) { return cpi->lst_fb_idx; } else if (ref_frame == GOLDEN_FRAME) { @@ -885,15 +885,25 @@ static int get_ref_frame_idx(const VP9_COMP *cpi, } } -static YV12_BUFFER_CONFIG *get_ref_frame_buffer(VP9_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { - VP9_COMMON *const cm = &cpi->common; - return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, - ref_frame)]].buf; +static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( + VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { + VP9_COMMON * const cm = &cpi->common; + return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]] + .buf; } void vp9_set_speed_features(VP9_COMP *cpi); +static INLINE int get_token_alloc(int mb_rows, int mb_cols) { + // TODO(JBB): make this work for alpha channel and double check we can't + // exceed this token count if we have a 32x32 transform crossing a boundary + // at a multiple of 16. + // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full + // resolution. We assume up to 1 token per pixel, and then allow + // a head room of 4. + return mb_rows * mb_cols * (16 * 16 * 3 + 4); +} + int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *reference); @@ -908,16 +918,13 @@ void vp9_scale_references(VP9_COMP *cpi); void vp9_update_reference_frames(VP9_COMP *cpi); -static int get_token_alloc(int mb_rows, int mb_cols) { - return mb_rows * mb_cols * (48 * 16 + 4); -} - extern const int q_trans[]; int64_t vp9_rescale(int64_t val, int64_t num, int denom); -static void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, - MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { +static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, + MV_REFERENCE_FRAME ref0, + MV_REFERENCE_FRAME ref1) { xd->block_refs[0] = &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0]; xd->block_refs[1] = &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME diff --git a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c new file mode 100644 index 000000000..f31b176e5 --- /dev/null +++ b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX2 +#include "vpx/vpx_integer.h" + +void vp9_sad32x32x4d_avx2(uint8_t *src, + int src_stride, + uint8_t *ref[4], + int ref_stride, + unsigned int res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32 ; i++) { + // load src and all refs + src_reg = _mm256_load_si256((__m256i *)(src)); + ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); + ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); + ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); + ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} + +void vp9_sad64x64x4d_avx2(uint8_t *src, + int src_stride, + uint8_t *ref[4], + int ref_stride, + unsigned int res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64 ; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_load_si256((__m256i *)(src)); + srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); + ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); + ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); + ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); + ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); + ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); + ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); + ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 72701d9d1..4c9350edb 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -280,7 +280,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, oxcf.width = ctx->si.w; oxcf.height = ctx->si.h; oxcf.version = 9; - oxcf.postprocess = 0; oxcf.max_threads = ctx->cfg.threads; oxcf.inv_tile_order = ctx->invert_tile_order; optr = vp9_create_decompressor(&oxcf); diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index a61f737aa..d89571935 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -89,6 +89,7 @@ VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c index d48a761ff..d4f4e9f38 100644 --- a/vpx/src/svc_encodeframe.c +++ b/vpx/src/svc_encodeframe.c @@ -583,8 +583,12 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, enc_cfg->rc_dropframe_thresh = 0; enc_cfg->rc_end_usage = VPX_CBR; enc_cfg->rc_resize_allowed = 0; - enc_cfg->rc_min_quantizer = 33; - enc_cfg->rc_max_quantizer = 33; + + if (enc_cfg->g_pass == VPX_RC_ONE_PASS) { + enc_cfg->rc_min_quantizer = 33; + enc_cfg->rc_max_quantizer = 33; + } + enc_cfg->rc_undershoot_pct = 100; enc_cfg->rc_overshoot_pct = 15; enc_cfg->rc_buf_initial_sz = 500; @@ -784,12 +788,17 @@ static void set_svc_parameters(SvcContext *svc_ctx, } layer_index = layer + VPX_SS_MAX_LAYERS - si->layers; - if (vpx_svc_is_keyframe(svc_ctx)) { - svc_params.min_quantizer = si->quantizer_keyframe[layer_index]; - svc_params.max_quantizer = si->quantizer_keyframe[layer_index]; + if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) { + if (vpx_svc_is_keyframe(svc_ctx)) { + svc_params.min_quantizer = si->quantizer_keyframe[layer_index]; + svc_params.max_quantizer = si->quantizer_keyframe[layer_index]; + } else { + svc_params.min_quantizer = si->quantizer[layer_index]; + svc_params.max_quantizer = si->quantizer[layer_index]; + } } else { - svc_params.min_quantizer = si->quantizer[layer_index]; - svc_params.max_quantizer = si->quantizer[layer_index]; + svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer; + svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer; } svc_params.distance_from_i_frame = si->frame_within_gop; |