diff options
-rw-r--r-- | examples/vpx_temporal_svc_encoder.c | 2 | ||||
-rw-r--r-- | test/fdct8x8_test.cc | 2 | ||||
-rw-r--r-- | vp8/common/onyx.h | 4 | ||||
-rw-r--r-- | vp8/encoder/denoising.c | 50 | ||||
-rw-r--r-- | vp8/encoder/denoising.h | 26 | ||||
-rw-r--r-- | vp8/encoder/encodeframe.c | 13 | ||||
-rw-r--r-- | vp8/encoder/ethreading.c | 15 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 31 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 2 | ||||
-rw-r--r-- | vp8/encoder/pickinter.c | 10 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vp9/encoder/arm/neon/vp9_dct_neon.c | 223 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 50 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 1 | ||||
-rw-r--r-- | vpx_ports/vpx_once.h | 27 |
15 files changed, 413 insertions, 47 deletions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index a7ad9f0c6..be3e7b2f1 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -579,7 +579,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly); } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 6a5d6bb98..567e5f698 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8DCT, ::testing::Values( - make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0))); + make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0))); INSTANTIATE_TEST_CASE_P( DISABLED_NEON, FwdTrans8x8HT, ::testing::Values( diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7d9441d54..ef7f61b12 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -108,8 +108,8 @@ extern "C" * For temporal denoiser: noise_sensitivity = 0 means off, * noise_sensitivity = 1 means temporal denoiser on for Y channel only, * noise_sensitivity = 2 means temporal denoiser on for all channels. - * noise_sensitivity = 3 will be used for aggressive mode in future. - * Temporal denoiser is enabled via the build option + * noise_sensitivity = 3 means aggressive denoising mode. + * Temporal denoiser is enabled via the configuration option: * CONFIG_TEMPORAL_DENOISING. * For spatial denoiser: noise_sensitivity controls the amount of * pre-processing blur: noise_sensitivity = 0 means off. diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index 1a401a4b9..c4c0de81b 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <limits.h> + #include "denoising.h" #include "vp8/common/reconinter.h" @@ -333,12 +335,33 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, return FILTER_BLOCK; } +void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser) { + if (!denoiser->aggressive_mode) { + denoiser->denoise_pars.scale_sse_thresh = 1; + denoiser->denoise_pars.scale_motion_thresh = 8; + denoiser->denoise_pars.scale_increase_filter = 0; + denoiser->denoise_pars.denoise_mv_bias = 95; + denoiser->denoise_pars.pickmode_mv_bias = 100; + denoiser->denoise_pars.qp_thresh = 0; + denoiser->denoise_pars.consec_zerolast = UINT_MAX; + } else { + denoiser->denoise_pars.scale_sse_thresh = 2; + denoiser->denoise_pars.scale_motion_thresh = 16; + denoiser->denoise_pars.scale_increase_filter = 1; + denoiser->denoise_pars.denoise_mv_bias = 60; + denoiser->denoise_pars.pickmode_mv_bias = 60; + denoiser->denoise_pars.qp_thresh = 100; + denoiser->denoise_pars.consec_zerolast = 10; + } +} + int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, - int num_mb_rows, int num_mb_cols) + int num_mb_rows, int num_mb_cols, int mode) { int i; assert(denoiser); denoiser->num_mb_cols = num_mb_cols; + denoiser->aggressive_mode = mode; for (i = 0; i < MAX_REF_FRAMES; i++) { @@ -369,10 +392,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1); vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols)); - + vp8_denoiser_set_parameters(denoiser); return 0; } + void vp8_denoiser_free(VP8_DENOISER *denoiser) { int i; @@ -401,6 +425,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, { int mv_row; int mv_col; + unsigned int motion_threshold; unsigned int motion_magnitude2; unsigned int sse_thresh; int sse_diff_thresh = 0; @@ -424,7 +449,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi; int sse_diff = 0; // Bias on zero motion vector sse. - int zero_bias = 95; + const int zero_bias = denoiser->denoise_pars.denoise_mv_bias; zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100); sse_diff = zero_mv_sse - best_sse; @@ -502,14 +527,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, mv_row = x->best_sse_mv.as_mv.row; mv_col = x->best_sse_mv.as_mv.col; motion_magnitude2 = mv_row * mv_row + mv_col * mv_col; - sse_thresh = SSE_THRESHOLD; - if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH; + motion_threshold = denoiser->denoise_pars.scale_motion_thresh * + NOISE_MOTION_THRESHOLD; - if (best_sse > sse_thresh || motion_magnitude2 - > 8 * NOISE_MOTION_THRESHOLD) - { - decision = COPY_BLOCK; - } + if (motion_magnitude2 < + denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD) + x->increase_denoising = 1; + + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD; + if (x->increase_denoising) + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH; + + if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold) + decision = COPY_BLOCK; if (decision == FILTER_BLOCK) { diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h index a1f195b72..1a42f86d3 100644 --- a/vp8/encoder/denoising.h +++ b/vp8/encoder/denoising.h @@ -39,16 +39,40 @@ enum vp8_denoiser_filter_state { kFilterNonZeroMV }; +typedef struct { + // Scale factor on sse threshold above which no denoising is done. + unsigned int scale_sse_thresh; + // Scale factor on motion magnitude threshold above which no + // denoising is done. + unsigned int scale_motion_thresh; + // Scale factor on motion magnitude below which we increase the strength of + // the temporal filter (in function vp8_denoiser_filter). + unsigned int scale_increase_filter; + // Scale factor to bias to ZEROMV for denoising. + unsigned int denoise_mv_bias; + // Scale factor to bias to ZEROMV for coding mode selection. + unsigned int pickmode_mv_bias; + // Quantizer threshold below which we use the segmentation map to switch off + // loop filter for blocks that have been coded as ZEROMV-LAST a certain number + // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to + // 0 when segmentation map is used for shutting off loop filter. + unsigned int qp_thresh; + // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST. + unsigned int consec_zerolast; +} denoise_params; + typedef struct vp8_denoiser { YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES]; YV12_BUFFER_CONFIG yv12_mc_running_avg; unsigned char* denoise_state; int num_mb_cols; + int aggressive_mode; + denoise_params denoise_pars; } VP8_DENOISER; int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, - int num_mb_rows, int num_mb_cols); + int num_mb_rows, int num_mb_cols, int mode); void vp8_denoiser_free(VP8_DENOISER *denoiser); diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index e6b0f9b64..aec6b9880 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi, } #endif + // Keep track of how many (consecutive) times a block is coded + // as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index+mb_col] < 255) + cpi->consec_zero_last[map_index+mb_col] += 1; + } else { + cpi->consec_zero_last[map_index+mb_col] = 0; + } + } /* Special case code for cyclic refresh * If cyclic update enabled then copy xd->mbmi.segment_id; (which diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index d4b17cef1..7b8b51f30 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data) } #endif + // Keep track of how many (consecutive) times a block + // is coded as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == + LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index+mb_col] < 255) + cpi->consec_zero_last[map_index+mb_col] += + 1; + } else { + cpi->consec_zero_last[map_index+mb_col] = 0; + } + } /* Special case code for cyclic refresh * If cyclic update enabled then copy diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 469d0d6e9..e81c05e6f 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) while(block_count && i != cpi->cyclic_refresh_mode_index); cpi->cyclic_refresh_mode_index = i; + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->denoiser.aggressive_mode != 0 && + Q < cpi->denoiser.denoise_pars.qp_thresh) { + // Under aggressive denoising mode, use segmentation to turn off loop + // filter below some qp thresh. The loop filter is turned off for all + // blocks that have been encoded as ZEROMV LAST x frames in a row, + // where x is set by cpi->denoiser.denoise_pars.consec_zerolast. + // This is to avoid "dot" artifacts that can occur from repeated + // loop filtering on noisy input source. + cpi->cyclic_refresh_q = Q; + lf_adjustment = -MAX_LOOP_FILTER; + for (i = 0; i < mbs_in_frame; ++i) { + seg_map[i] = (cpi->consec_zero_last[i] > + cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0; + } + } +#endif } /* Activate segmentation. */ @@ -1752,7 +1770,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) int width = (cpi->oxcf.Width + 15) & ~15; int height = (cpi->oxcf.Height + 15) & ~15; vp8_denoiser_allocate(&cpi->denoiser, width, height, - cpi->common.mb_rows, cpi->common.mb_cols); + cm->mb_rows, cm->mb_cols, + ((cpi->oxcf.noise_sensitivity == 3) ? 1 : 0)); } } #endif @@ -1896,6 +1915,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) else cpi->cyclic_refresh_map = (signed char *) NULL; + CHECK_MEM_ERROR(cpi->consec_zero_last, + vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + #ifdef VP8_ENTROPY_STATS init_context_counters(); #endif @@ -2416,6 +2438,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) vpx_free(cpi->mb.ss); vpx_free(cpi->tok); vpx_free(cpi->cyclic_refresh_map); + vpx_free(cpi->consec_zero_last); vp8_remove_common(&cpi->common); vpx_free(cpi); @@ -3478,6 +3501,9 @@ static void encode_frame_to_data_rate { cpi->mb.rd_thresh_mult[i] = 128; } + + // Reset the zero_last counter to 0 on key frame. + vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); } #if 0 @@ -3899,6 +3925,7 @@ static void encode_frame_to_data_rate #endif + #ifdef OUTPUT_YUV_SRC vp8_write_yuv_frame(yuv_file, cpi->Source); #endif @@ -3994,6 +4021,8 @@ static void encode_frame_to_data_rate else disable_segmentation(cpi); } + // Reset the consec_zero_last counter on key frame. + vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); vp8_set_quantizer(cpi, Q); } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index df17dff34..7a8baca77 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -511,6 +511,8 @@ typedef struct VP8_COMP int cyclic_refresh_mode_index; int cyclic_refresh_q; signed char *cyclic_refresh_map; + // Count on how many (consecutive) times a macroblock uses ZER0MV_LAST. + unsigned char *consec_zero_last; // Frame counter for the temporal pattern. Counter is rest when the temporal // layers are changed dynamically (run-time change). diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 86108b70a..ec1ea146f 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); - int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, */ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + rd_adjustment = (int)(rd_adjustment * + cpi->denoiser.denoise_pars.pickmode_mv_bias / 100); + } +#endif + /* if we encode a new mv this is important * find the best new motion vector */ @@ -1168,7 +1174,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { - int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0; + int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0; int block_index = mb_row * cpi->common.mb_cols + mb_col; if (x->best_sse_inter_mode == DC_PRED) { diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 666afa4dc..d58774924 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -757,10 +757,10 @@ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stri specialize qw/vp9_fdct4x4 sse2/; add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8_1 sse2/; +specialize qw/vp9_fdct8x8_1 sse2 neon/; add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8 sse2/, "$ssse3_x86_64"; +specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride"; specialize qw/vp9_fdct16x16_1 sse2/; diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 000000000..6c66f5d5b --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} + +void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), + vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), + vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4), + vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), + vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from vp9_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index fe15edfdc..1b574758b 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -553,6 +553,9 @@ void vp9_first_pass(VP9_COMP *cpi) { const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif vp9_clear_system_state(); @@ -600,7 +603,7 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { // initialization - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; } #endif @@ -704,26 +707,20 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { // intra predication statistics - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0; - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= - FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &= - (~FPMB_NONZERO_MOTION_MASK); + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_NONZERO_MOTION_MASK; if (this_error > FPMB_ERROR_LEVEL4_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= - FPMB_ERROR_LEVEL4_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK; } else if (this_error > FPMB_ERROR_LEVEL3_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= - FPMB_ERROR_LEVEL3_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK; } else if (this_error > FPMB_ERROR_LEVEL2_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= - FPMB_ERROR_LEVEL2_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK; } else if (this_error > FPMB_ERROR_LEVEL1_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= - FPMB_ERROR_LEVEL1_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK; } else { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= - FPMB_ERROR_LEVEL0_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK; } } #endif @@ -759,25 +756,24 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { // inter predication statistics - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0; - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &= - (~FPMB_DCINTRA_MASK); - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &= - (~FPMB_NONZERO_MOTION_MASK); + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_NONZERO_MOTION_MASK; if (this_error > FPMB_ERROR_LEVEL4_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK; } else if (this_error > FPMB_ERROR_LEVEL3_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK; } else if (this_error > FPMB_ERROR_LEVEL2_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK; } else if (this_error > FPMB_ERROR_LEVEL1_TH) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK; } else { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK; } } @@ -788,7 +784,7 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |= + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_NONZERO_MOTION_MASK; } #endif diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 3381cb95a..0ea28d313 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -130,5 +130,6 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h index 182892acf..bd9eebd64 100644 --- a/vpx_ports/vpx_once.h +++ b/vpx_ports/vpx_once.h @@ -73,6 +73,33 @@ static void once(void (*func)(void)) } +#elif CONFIG_MULTITHREAD && defined(__OS2__) +#define INCL_DOS +#include <os2.h> +static void once(void (*func)(void)) +{ + static int done; + + /* If the initialization is complete, return early. */ + if(done) + return; + + /* Causes all other threads in the process to block themselves + * and give up their time slice. + */ + DosEnterCritSec(); + + if (!done) + { + func(); + done = 1; + } + + /* Restores normal thread dispatching for the current process. */ + DosExitCritSec(); +} + + #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> static void once(void (*func)(void)) |