diff options
59 files changed, 1322 insertions, 436 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh index f98018027..c527cd527 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -666,6 +666,10 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin13 ;; + *darwin14*) + tgt_isa=x86_64 + tgt_os=darwin14 + ;; x86_64*mingw32*) tgt_os=win64 ;; @@ -775,6 +779,10 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.9" add_ldflags "-mmacosx-version-min=10.9" ;; + *-darwin14-*) + add_cflags "-mmacosx-version-min=10.10" + add_ldflags "-mmacosx-version-min=10.10" + ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" @@ -1243,10 +1251,14 @@ EOF fi tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]") + if [ "${tgt_os_no_version}" = "darwin" ] || \ + [ "${tgt_os_no_version}" = "openbsd" ] || [ "`uname`" = "OpenBSD" ]; then + openbsd_like=yes + fi # Default use_x86inc to yes when we are 64 bit, non-pic, or on any # non-Darwin target. 
if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \ - [ "${tgt_os_no_version}" != "darwin" ]; then + [ "${openbsd_like}" != "yes" ]; then soft_enable use_x86inc fi @@ -58,6 +58,7 @@ Advanced options: ${toggle_postproc_visualizer} macro block / block level visualizers ${toggle_multi_res_encoding} enable multiple-resolution encoding ${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser + ${toggle_vp9_highbitdepth} enable 10/12 bit support in VP9 ${toggle_vp9_temporal_denoising} enable vp9 temporal denoising ${toggle_webm_io} enable input from and output to WebM container @@ -128,6 +129,7 @@ all_platforms="${all_platforms} x86-darwin10-gcc" all_platforms="${all_platforms} x86-darwin11-gcc" all_platforms="${all_platforms} x86-darwin12-gcc" all_platforms="${all_platforms} x86-darwin13-gcc" +all_platforms="${all_platforms} x86-darwin14-gcc" all_platforms="${all_platforms} x86-iphonesimulator-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" @@ -145,6 +147,7 @@ all_platforms="${all_platforms} x86_64-darwin10-gcc" all_platforms="${all_platforms} x86_64-darwin11-gcc" all_platforms="${all_platforms} x86_64-darwin12-gcc" all_platforms="${all_platforms} x86_64-darwin13-gcc" +all_platforms="${all_platforms} x86_64-darwin14-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" @@ -161,6 +164,7 @@ all_platforms="${all_platforms} universal-darwin10-gcc" all_platforms="${all_platforms} universal-darwin11-gcc" all_platforms="${all_platforms} universal-darwin12-gcc" all_platforms="${all_platforms} universal-darwin13-gcc" +all_platforms="${all_platforms} universal-darwin14-gcc" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured diff --git a/libs.doxy_template b/libs.doxy_template index 02e290242..5a8f84728 100644 --- 
a/libs.doxy_template +++ b/libs.doxy_template @@ -36,7 +36,7 @@ DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded # by quotes) that should identify the project. -PROJECT_NAME = "WebM VP8 Codec SDK" +PROJECT_NAME = "WebM Codec SDK" # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. @@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES = 30 SHOW_USED_FILES = YES -# If the sources in your project are distributed over multiple directories -# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy -# in the documentation. The default is NO. - -SHOW_DIRECTORIES = NO - # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from the # version control system). Doxygen will invoke the program by executing (via @@ -715,12 +709,6 @@ HTML_FOOTER = HTML_STYLESHEET = -# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, -# files or namespaces will be aligned in HTML using tables. If set to -# NO a bullet list will be used. - -HTML_ALIGN_MEMBERS = YES - # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compressed HTML help file (.chm) diff --git a/mainpage.dox b/mainpage.dox index e2ec28002..ec202fa4f 100644 --- a/mainpage.dox +++ b/mainpage.dox @@ -1,4 +1,4 @@ -/*!\mainpage WebM VP8 Codec SDK +/*!\mainpage WebM Codec SDK \section main_contents Page Contents - \ref main_intro @@ -6,11 +6,11 @@ - \ref main_support \section main_intro Introduction - Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your - applications with the VP8 video codec, a high quality, royalty free, open - source codec deployed on millions of computers and devices worldwide. + Welcome to the WebM Codec SDK. 
This SDK allows you to integrate your + applications with the VP8 and VP9 video codecs, high quality, royalty free, + open source codecs deployed on billions of computers and devices worldwide. - This distribution of the WebM VP8 Codec SDK includes the following support: + This distribution of the WebM Codec SDK includes the following support: \if vp8_encoder - \ref vp8_encoder @@ -28,12 +28,12 @@ - Read the \ref samples "sample code" for examples of how to interact with the codec. - \ref codec reference - \if encoder - - \ref encoder reference - \endif - \if decoder - - \ref decoder reference - \endif + \if encoder + - \ref encoder reference + \endif + \if decoder + - \ref decoder reference + \endif \section main_support Support Options & FAQ The WebM project is an open source project supported by its community. For diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 573870e91..e52934771 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -38,6 +38,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, first_drop_ = 0; bits_total_ = 0; duration_ = 0.0; + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -45,6 +47,17 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, if (video->frame() == 1) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + } + const vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; @@ -124,6 +137,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, double effective_datarate_; size_t bits_in_last_frame_; int denoiser_on_; + int 
denoiser_offon_test_; + int denoiser_offon_period_; }; #if CONFIG_TEMPORAL_DENOISING @@ -155,6 +170,29 @@ TEST_P(DatarateTestLarge, DenoiserLevels) { << " The datarate for the file missed the target!"; } } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestLarge, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + << " The datarate for the file missed the target!"; +} #endif // CONFIG_TEMPORAL_DENOISING TEST_P(DatarateTestLarge, BasicBufferModel) { @@ -669,7 +707,7 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) { // Check basic datarate targeting, for a single bitrate, when denoiser is off // and on. 
-TEST_P(DatarateTestVP9Large, DenoiserOffon) { +TEST_P(DatarateTestVP9Large, DenoiserOffOn) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 77135ec6d..b03235b71 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -114,6 +114,7 @@ void EncoderTest::SetMode(TestMode mode) { static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) { bool match = (img1->fmt == img2->fmt) && + (img1->cs == img2->cs) && (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h); diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 182547bdf..9a99a80f8 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -37,6 +37,7 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, void Reset() { error_nframes_ = 0; droppable_nframes_ = 0; + pattern_switch_ = 0; } virtual void SetUp() { @@ -62,19 +63,37 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, // 1 3 // 0 2 ..... // LAST is updated on base/layer 0, GOLDEN updated on layer 1. - int SetFrameFlags(int frame_num, int num_temp_layers) { + // Non-zero pattern_switch parameter means pattern will switch to + // not using LAST for frame_num >= pattern_switch. + int SetFrameFlags(int frame_num, + int num_temp_layers, + int pattern_switch) { int frame_flags = 0; if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - // Layer 0: predict from L and ARF, update L. - frame_flags = VP8_EFLAG_NO_REF_GF | - VP8_EFLAG_NO_UPD_GF | - VP8_EFLAG_NO_UPD_ARF; - } else { - // Layer 1: predict from L, GF, and ARF, and update GF. - frame_flags = VP8_EFLAG_NO_UPD_ARF | - VP8_EFLAG_NO_UPD_LAST; - } + if (frame_num % 2 == 0) { + if (frame_num < pattern_switch || pattern_switch == 0) { + // Layer 0: predict from LAST and ARF, update LAST. 
+ frame_flags = VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 0: predict from GF and ARF, update GF. + frame_flags = VP8_EFLAG_NO_REF_LAST | + VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ARF; + } + } else { + if (frame_num < pattern_switch || pattern_switch == 0) { + // Layer 1: predict from L, GF, and ARF, update GF. + frame_flags = VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_UPD_LAST; + } else { + // Layer 1: predict from GF and ARF, update GF. + frame_flags = VP8_EFLAG_NO_REF_LAST | + VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ARF; + } + } } return frame_flags; } @@ -86,7 +105,9 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. if (cfg_.ts_number_layers > 1) { - frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers); + frame_flags_ = SetFrameFlags(video->frame(), + cfg_.ts_number_layers, + pattern_switch_); for (unsigned int i = 0; i < droppable_nframes_; ++i) { if (droppable_frames_[i] == video->frame()) { std::cout << "Encoding droppable frame: " @@ -168,11 +189,16 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, return mismatch_nframes_; } + void SetPatternSwitch(int frame_switch) { + pattern_switch_ = frame_switch; + } + private: double psnr_; unsigned int nframes_; unsigned int error_nframes_; unsigned int droppable_nframes_; + unsigned int pattern_switch_; double mismatch_psnr_; unsigned int mismatch_nframes_; unsigned int error_frames_[kMaxErrorFrames]; @@ -299,6 +325,7 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) { // Error resilient mode ON. cfg_.g_error_resilient = 1; cfg_.kf_mode = VPX_KF_DISABLED; + SetPatternSwitch(0); // The odd frames are the enhancement layer for 2 layer pattern, so set // those frames as droppable. Drop the last 7 frames. 
@@ -316,6 +343,45 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) { Reset(); } +// Check for successful decoding and no encoder/decoder mismatch +// for a two layer temporal pattern, where at some point in the +// sequence, the LAST ref is not used anymore. +TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 0; + + cfg_.rc_end_usage = VPX_CBR; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 100); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + cfg_.kf_mode = VPX_KF_DISABLED; + SetPatternSwitch(60); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Test that no mismatches have been found + std::cout << " Mismatch frames: " + << GetMismatchFrames() << "\n"; + EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0); + + // Reset previously set of error/droppable frames. 
+ Reset(); +} + class ErrorResilienceTestLargeCodecControls : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: diff --git a/test/resize_test.cc b/test/resize_test.cc index 9d0c570ae..5c25dc11c 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -144,6 +144,7 @@ class ResizeTest : public ::libvpx_test::EncoderTest, TEST_P(ResizeTest, TestExternalResizeWorks) { ResizingVideoSource video; + cfg_.g_lag_in_frames = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin(); @@ -153,9 +154,9 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight); EXPECT_EQ(expected_w, info->w) - << "Frame " << frame << "had unexpected width"; + << "Frame " << frame << " had unexpected width"; EXPECT_EQ(expected_h, info->h) - << "Frame " << frame << "had unexpected height"; + << "Frame " << frame << " had unexpected height"; } } @@ -261,6 +262,8 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) { } VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_CASE(ResizeTest, + ::testing::Values(::libvpx_test::kRealTime)); VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest, ::testing::Values(::libvpx_test::kOnePassBest)); } // namespace diff --git a/test/variance_test.cc b/test/variance_test.cc index 4d279f686..a8dd7de13 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1914,11 +1914,17 @@ INSTANTIATE_TEST_CASE_P( const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; +const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon; +const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon; +const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon; INSTANTIATE_TEST_CASE_P( NEON, 
VP9VarianceTest, ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), make_tuple(4, 4, variance16x16_neon, 0), - make_tuple(5, 5, variance32x32_neon, 0))); + make_tuple(5, 5, variance32x32_neon, 0), + make_tuple(5, 6, variance32x64_neon, 0), + make_tuple(6, 5, variance64x32_neon, 0), + make_tuple(6, 6, variance64x64_neon, 0))); const vp9_subpixvariance_fn_t subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; @@ -1926,11 +1932,14 @@ const vp9_subpixvariance_fn_t subpel_variance16x16_neon = vp9_sub_pixel_variance16x16_neon; const vp9_subpixvariance_fn_t subpel_variance32x32_neon = vp9_sub_pixel_variance32x32_neon; +const vp9_subpixvariance_fn_t subpel_variance64x64_neon = + vp9_sub_pixel_variance64x64_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9SubpelVarianceTest, ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0), make_tuple(4, 4, subpel_variance16x16_neon, 0), - make_tuple(5, 5, subpel_variance32x32_neon, 0))); + make_tuple(5, 5, subpel_variance32x32_neon, 0), + make_tuple(6, 6, subpel_variance64x64_neon, 0))); #endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc index fa04528a2..252ed4efa 100644 --- a/test/vp9_avg_test.cc +++ b/test/vp9_avg_test.cc @@ -165,4 +165,14 @@ INSTANTIATE_TEST_CASE_P( #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, AverageTest, + ::testing::Values( + make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon), + make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon), + make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon))); + +#endif + } // namespace diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index 6c354fd38..34e7854a9 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -65,14 +65,15 @@ struct EncodeParameters { int32_t lossless; int32_t error_resilient; int32_t frame_parallel; + vpx_color_space_t cs; // TODO(JBB): quantizers / bitrate }; const EncodeParameters kVP9EncodeParameterSet[] = { - 
{0, 0, 0, 1, 0}, - {0, 0, 0, 0, 0}, - {0, 0, 1, 0, 0}, - {0, 2, 0, 0, 1}, + {0, 0, 0, 1, 0, VPX_CS_BT_601}, + {0, 0, 0, 0, 0, VPX_CS_BT_709}, + {0, 0, 1, 0, 0, VPX_CS_BT_2020}, + {0, 2, 0, 0, 1, VPX_CS_UNKNOWN}, // TODO(JBB): Test profiles (requires more work). }; @@ -109,6 +110,7 @@ class Vp9EncoderParmsGetToDecoder virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (video->frame() == 1) { + encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless); encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, encode_parms.frame_parallel); @@ -147,7 +149,7 @@ class Vp9EncoderParmsGetToDecoder EXPECT_EQ(common->frame_parallel_decoding_mode, encode_parms.frame_parallel); } - + EXPECT_EQ(common->color_space, encode_parms.cs); EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols); EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows); @@ -80,8 +80,11 @@ The available initialization methods are: - \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif - \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif + \if encoder + - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) + - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) + . + \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index fc026aa9c..d02cd30b9 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -753,45 +753,46 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int ref_frame_map[4]; int sign_bias = 0; int dot_artifact_candidate = 0; - // For detecting dot artifact. 
- unsigned char* target = x->src.y_buffer; - unsigned char* target_u = x->block[16].src + *x->block[16].base_src; - unsigned char* target_v = x->block[20].src + *x->block[20].base_src; - int stride = x->src.y_stride; - int stride_uv = x->block[16].src_stride; + get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset); + + // If the current frame is using LAST as a reference, check for + // biasing the mode selection for dot artifacts. + if (cpi->ref_frame_flags & VP8_LAST_FRAME) { + unsigned char* target_y = x->src.y_buffer; + unsigned char* target_u = x->block[16].src + *x->block[16].base_src; + unsigned char* target_v = x->block[20].src + *x->block[20].base_src; + int stride = x->src.y_stride; + int stride_uv = x->block[16].src_stride; #if CONFIG_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity) { - int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0; - target = - cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset; - stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride; - if (uv_denoise) { - target_u = - cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + recon_uvoffset; - target_v = - cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + recon_uvoffset; - stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride; + if (cpi->oxcf.noise_sensitivity) { + const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 
1 : 0; + target_y = + cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset; + stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride; + if (uv_denoise) { + target_u = + cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + + recon_uvoffset; + target_v = + cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + + recon_uvoffset; + stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride; + } } - } #endif - - get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset); - - dot_artifact_candidate = - check_dot_artifact_candidate(cpi, x, - target, stride, - plane[LAST_FRAME][0], mb_row, mb_col, 0); - // If not found in Y channel, check UV channel. - if (!dot_artifact_candidate) { dot_artifact_candidate = - check_dot_artifact_candidate(cpi, x, - target_u, stride_uv, - plane[LAST_FRAME][1], mb_row, mb_col, 1); + check_dot_artifact_candidate(cpi, x, target_y, stride, + plane[LAST_FRAME][0], mb_row, mb_col, 0); + // If not found in Y channel, check UV channel. if (!dot_artifact_candidate) { dot_artifact_candidate = - check_dot_artifact_candidate(cpi, x, - target_v, stride_uv, - plane[LAST_FRAME][2], mb_row, mb_col, 2); + check_dot_artifact_candidate(cpi, x, target_u, stride_uv, + plane[LAST_FRAME][1], mb_row, mb_col, 1); + if (!dot_artifact_candidate) { + dot_artifact_candidate = + check_dot_artifact_candidate(cpi, x, target_v, stride_uv, + plane[LAST_FRAME][2], mb_row, mb_col, 2); + } } } diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 4557e19bf..47e5164d7 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -334,20 +334,6 @@ const vp9_tree_index vp9_switchable_interp_tree -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -#define COUNT_SAT 20 -#define MAX_UPDATE_FACTOR 128 - -static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { - return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); -} - -static void adapt_probs(const vp9_tree_index *tree, - const vp9_prob *pre_probs, 
const unsigned int *counts, - vp9_prob *probs) { - vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, - probs); -} - void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; FRAME_CONTEXT *fc = cm->fc; @@ -355,39 +341,41 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { const FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i], - counts->intra_inter[i]); + fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i], + counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i], - counts->comp_inter[i]); + fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i], + counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i], - counts->comp_ref[i]); + fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i], + counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j], - counts->single_ref[i][j]); + fc->single_ref_prob[i][j] = mode_mv_merge_probs( + pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], + vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i], counts->inter_mode[i], fc->inter_mode_probs[i]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], + vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i], counts->y_mode[i], fc->y_mode_prob[i]); for (i = 0; i < INTRA_MODES; ++i) - adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], - counts->uv_mode[i], fc->uv_mode_prob[i]); + vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i], + counts->uv_mode[i], fc->uv_mode_prob[i]); for (i = 0; i < 
PARTITION_CONTEXTS; i++) - adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], - counts->partition[i], fc->partition_prob[i]); + vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i], + counts->partition[i], fc->partition_prob[i]); if (cm->interp_filter == SWITCHABLE) { for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) - adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], - counts->switchable_interp[i], fc->switchable_interp_prob[i]); + vp9_tree_merge_probs(vp9_switchable_interp_tree, + pre_fc->switchable_interp_prob[i], + counts->switchable_interp[i], + fc->switchable_interp_prob[i]); } if (cm->tx_mode == TX_MODE_SELECT) { @@ -399,23 +387,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); for (j = 0; j < TX_SIZES - 3; ++j) - fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j], - branch_ct_8x8p[j]); + fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) - fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j], - branch_ct_16x16p[j]); + fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) - fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j], - branch_ct_32x32p[j]); + fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs( + pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } } for (i = 0; i < SKIP_CONTEXTS; ++i) - fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]); + fc->skip_probs[i] = mode_mv_merge_probs( + pre_fc->skip_probs[i], counts->skip[i]); } static void set_default_lf_deltas(struct loopfilter *lf) { diff --git a/vp9/common/vp9_entropymv.c 
b/vp9/common/vp9_entropymv.c index 922c03947..2477e6ef3 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -11,9 +11,6 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_entropymv.h" -#define MV_COUNT_SAT 20 -#define MV_MAX_UPDATE_FACTOR 128 - // Integer pel reference mv threshold for use of high-precision 1/8 mv #define COMPANDED_MVREF_THRESH 8 @@ -183,16 +180,6 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { } } -static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { - return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); -} - -static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, - const unsigned int *counts, vp9_prob *probs) { - vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, - MV_MAX_UPDATE_FACTOR, probs); -} - void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; @@ -200,30 +187,32 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc; const nmv_context_counts *counts = &cm->counts.mv; - adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints); + vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, + fc->joints); for (i = 0; i < 2; ++i) { nmv_component *comp = &fc->comps[i]; const nmv_component *pre_comp = &pre_fc->comps[i]; const nmv_component_counts *c = &counts->comps[i]; - comp->sign = adapt_prob(pre_comp->sign, c->sign); - adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, - comp->classes); - adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0); + comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign); + vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes, + comp->classes); + vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, + comp->class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]); 
+ comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]); for (j = 0; j < CLASS0_SIZE; ++j) - adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j], - comp->class0_fp[j]); + vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], + c->class0_fp[j], comp->class0_fp[j]); - adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); + vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp); if (allow_hp) { - comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp); - comp->hp = adapt_prob(pre_comp->hp, c->hp); + comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp); + comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp); } } } diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 7454dd439..7938fc10a 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -99,17 +99,6 @@ typedef enum { } TX_TYPE; typedef enum { - UNKNOWN = 0, - BT_601 = 1, // YUV - BT_709 = 2, // YUV - SMPTE_170 = 3, // YUV - SMPTE_240 = 4, // YUV - BT_2020 = 5, // YUV - RESERVED_2 = 6, - SRGB = 7 // RGB -} COLOR_SPACE; - -typedef enum { VP9_LAST_FLAG = 1 << 0, VP9_GOLD_FLAG = 1 << 1, VP9_ALT_FLAG = 1 << 2, diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 58b2da75f..2101ec58c 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, } #endif // CONFIG_VP9_HIGHBITDEPTH -static void filter_block_plane_non420(VP9_COMMON *cm, - struct macroblockd_plane *plane, - MODE_INFO *mi_8x8, - int mi_row, int mi_col) { +void vp9_filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO *mi_8x8, + int mi_row, int mi_col) { const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_y; @@ -1598,8 +1598,8 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, if (use_420) vp9_filter_block_plane(cm, 
&planes[plane], mi_row, &lfm); else - filter_block_plane_non420(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); } } } diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 4c15e6bd4..6d7cabf7c 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -97,6 +97,11 @@ void vp9_filter_block_plane(struct VP9Common *const cm, int mi_row, LOOP_FILTER_MASK *lfm); +void vp9_filter_block_plane_non420(struct VP9Common *cm, + struct macroblockd_plane *plane, + MODE_INFO *mi_8x8, + int mi_row, int mi_col); + void vp9_loop_filter_init(struct VP9Common *cm); // Update the loop filter for the current frame. diff --git a/vp9/decoder/vp9_dthread.c b/vp9/common/vp9_loopfilter_thread.c index 3d2d0dd2e..2d47daeaf 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/common/vp9_loopfilter_thread.c @@ -9,14 +9,10 @@ */ #include "./vpx_config.h" - #include "vpx_mem/vpx_mem.h" - +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - #if CONFIG_MULTITHREAD static INLINE void mutex_lock(pthread_mutex_t *const mutex) { const int kMaxTryLocks = 4000; @@ -88,31 +84,43 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } // Implement row loopfiltering for each thread. -static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, - VP9_COMMON *const cm, - struct macroblockd_plane planes[MAX_MB_PLANE], - int start, int stop, int y_only, - VP9LfSync *const lf_sync) { +static INLINE +void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer, + VP9_COMMON *const cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9LfSync *const lf_sync) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; - int r, c; // SB row and col + const int use_420 = y_only || (planes[1].subsampling_y == 1 && + planes[1].subsampling_x == 1); const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + int mi_row, mi_col; - for (r = start; r < stop; r += lf_sync->num_workers) { - const int mi_row = r << MI_BLOCK_SIZE_LOG2; + for (mi_row = start; mi_row < stop; + mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride; - for (c = 0; c < sb_cols; ++c) { - const int mi_col = c << MI_BLOCK_SIZE_LOG2; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; LOOP_FILTER_MASK lfm; int plane; sync_read(lf_sync, r, c); vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); - vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); + + // TODO(JBB): Make setup_mask work for non 420. + if (use_420) + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, + &lfm); for (plane = 0; plane < num_planes; ++plane) { - vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); + if (use_420) + vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); + else + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); } sync_write(lf_sync, r, c, sb_cols); @@ -123,37 +131,33 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, // Row-based multi-threaded loopfilter hook static int loop_filter_row_worker(VP9LfSync *const lf_sync, LFWorkerData *const lf_data) { - loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, - lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); return 1; } -// VP9 decoder: Implement multi-threaded loopfilter that uses the tile -// threads. 
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, - YV12_BUFFER_CONFIG *frame, - struct macroblockd_plane planes[MAX_MB_PLANE], - VP9_COMMON *cm, - VP9Worker *workers, int nworkers, - int frame_filter_level, - int y_only) { +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9Worker *workers, int nworkers, + VP9LfSync *lf_sync) { const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + // Decoder may allocate more threads than number of tiles based on user's + // input. const int tile_cols = 1 << cm->log2_tile_cols; const int num_workers = MIN(nworkers, tile_cols); int i; - if (!frame_filter_level) return; - if (!lf_sync->sync_range || cm->last_height != cm->height || num_workers > lf_sync->num_workers) { vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } - vp9_loop_filter_frame_init(cm, frame_filter_level); - // Initialize cur_sb_col to -1 for all SB rows. 
vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -175,8 +179,8 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, // Loopfilter data vp9_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = i; - lf_data->stop = sb_rows; + lf_data->start = start + i * MI_BLOCK_SIZE; + lf_data->stop = stop; lf_data->y_only = y_only; // Start loopfiltering @@ -193,8 +197,33 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, } } +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, + int y_only, int partial_frame, + VP9Worker *workers, int num_workers, + VP9LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + if (!frame_filter_level) return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); + + loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, + y_only, workers, num_workers, lf_sync); +} + // Set up nsync by width. -static int get_sync_range(int width) { +static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. if (width < 640) diff --git a/vp9/decoder/vp9_dthread.h b/vp9/common/vp9_loopfilter_thread.h index 664aaa32a..bca357e52 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/common/vp9_loopfilter_thread.h @@ -8,23 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - +#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ +#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ #include "./vpx_config.h" +#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_thread.h" -#include "vp9/decoder/vp9_reader.h" -#include "vpx/internal/vpx_codec_internal.h" struct VP9Common; -struct VP9Decoder; - -typedef struct TileWorkerData { - struct VP9Common *cm; - vp9_reader bit_reader; - DECLARE_ALIGNED(16, struct macroblockd, xd); - struct vpx_internal_error_info error_info; -} TileWorkerData; // Loopfilter row synchronization typedef struct VP9LfSyncData { @@ -45,19 +35,19 @@ typedef struct VP9LfSyncData { } VP9LfSync; // Allocate memory for loopfilter row synchronization. -void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows, int width, int num_workers); // Deallocate loopfilter synchronization related mutex and data. void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); // Multi-threaded loopfilter that uses the tile threads. 
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, - YV12_BUFFER_CONFIG *frame, - struct macroblockd_plane planes[MAX_MB_PLANE], +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - VP9Worker *workers, int num_workers, + struct macroblockd_plane planes[MAX_MB_PLANE], int frame_filter_level, - int y_only); + int y_only, int partial_frame, + VP9Worker *workers, int num_workers, + VP9LfSync *lf_sync); -#endif // VP9_DECODER_VP9_DTHREAD_H_ +#endif // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index e7ee903c6..1494c3fd7 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -35,14 +35,26 @@ static void filter_by_weight(const uint8_t *src, int src_stride, } } +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight); +} + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight); +} + static void filter_by_weight32x32(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int weight) { - filter_by_weight(src, src_stride, dst, dst_stride, 16, weight); - filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight); - filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16, - dst_stride, 16, weight); - filter_by_weight(src + src_stride * 16 + 16, src_stride, - dst + dst_stride * 16 + 16, dst_stride, 16, weight); + vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight); + vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride, + weight); + vp9_filter_by_weight16x16(src + src_stride * 16, src_stride, + dst + dst_stride * 16, dst_stride, weight); + vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride, + dst + dst_stride * 16 + 16, dst_stride, weight); 
} static void filter_by_weight64x64(const uint8_t *src, int src_stride, @@ -62,13 +74,13 @@ static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd, int uvd_stride, BLOCK_SIZE block_size, int weight) { if (block_size == BLOCK_16X16) { - filter_by_weight(y, y_stride, yd, yd_stride, 16, weight); - filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight); - filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight); + vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight); + vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight); + vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight); } else if (block_size == BLOCK_32X32) { filter_by_weight32x32(y, y_stride, yd, yd_stride, weight); - filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight); - filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight); + vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight); + vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight); } else if (block_size == BLOCK_64X64) { filter_by_weight64x64(y, y_stride, yd, yd_stride, weight); filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight); diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index ff1ab9fa2..1a957bc99 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -76,7 +76,7 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); - COLOR_SPACE color_space; + vpx_color_space_t color_space; int width; int height; diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c index a1befc63e..3b7b9bf3b 100644 --- a/vp9/common/vp9_prob.c +++ b/vp9/common/vp9_prob.c @@ -29,33 +29,25 @@ const uint8_t vp9_norm[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - static unsigned int tree_merge_probs_impl(unsigned int i, const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, - unsigned int count_sat, - unsigned int 
max_update, vp9_prob *probs) { const int l = tree[i]; const unsigned int left_count = (l <= 0) ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, - count_sat, max_update, probs); + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); const int r = tree[i + 1]; const unsigned int right_count = (r <= 0) ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, - count_sat, max_update, probs); + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, - count_sat, max_update); + probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); return left_count + right_count; } void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, - const unsigned int *counts, unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, - max_update_factor, probs); + const unsigned int *counts, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, probs); } diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h index bc1511a5e..c69c62c81 100644 --- a/vp9/common/vp9_prob.h +++ b/vp9/common/vp9_prob.h @@ -33,6 +33,8 @@ typedef int8_t vp9_tree_index; #define vp9_complement(x) (255 - x) +#define MODE_MV_COUNT_SAT 20 + /* We build coding trees compactly in arrays. Each node of the tree is a pair of vp9_tree_indices. Array index often references a corresponding probability table. 
@@ -69,9 +71,28 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob, return weighted_prob(pre_prob, prob, factor); } +// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; +static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 +}; + +static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob, + const unsigned int ct[2]) { + const unsigned int den = ct[0] + ct[1]; + if (den == 0) { + return pre_prob; + } else { + const unsigned int count = MIN(den, MODE_MV_COUNT_SAT); + const unsigned int factor = count_to_update_factor[count]; + const vp9_prob prob = + clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den); + return weighted_prob(pre_prob, prob, factor); + } +} + void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, - const unsigned int *counts, unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs); + const unsigned int *counts, vp9_prob *probs); DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 88f85a86d..12f076fed 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -274,6 +274,12 @@ $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm; add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; specialize qw/vp9_plane_add_noise sse2/; $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; + +add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; +specialize qw/vp9_filter_by_weight16x16 sse2/; + +add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; +specialize qw/vp9_filter_by_weight8x8 sse2/; } # @@ -798,16 
+804,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x64/, "$sse2_x86inc"; +specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; @@ -837,7 +843,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -1098,7 +1104,7 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; -specialize qw/vp9_avg_8x8 sse2/; +specialize qw/vp9_avg_8x8 sse2 neon/; add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p"; specialize qw/vp9_avg_4x4 sse2/; @@ -1160,7 +1166,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_fdct8x8_quant sse2 ssse3/; + specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # diff --git a/vp9/common/x86/vp9_mfqe_sse2.asm b/vp9/common/x86/vp9_mfqe_sse2.asm new file mode 100644 index 000000000..6029420d1 --- /dev/null +++ b/vp9/common/x86/vp9_mfqe_sse2.asm @@ -0,0 +1,287 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; This file is a duplicate of mfqe_sse2.asm in VP8. +; TODO(jackychen): Find a way to fix the duplicate. 
+%include "vpx_ports/x86_abi_support.asm" + +;void vp9_filter_by_weight16x16_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp9_filter_by_weight16x16_sse2) PRIVATE +sym(vp9_filter_by_weight16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 16 ; loop count + pxor xmm6, xmm6 + +.combine + movdqa xmm2, [rax] + movdqa xmm4, [rdx] + add rax, rsi + + ; src * src_weight + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm6 + punpckhbw xmm3, xmm6 + pmullw xmm2, xmm0 + pmullw xmm3, xmm0 + + ; dst * dst_weight + movdqa xmm5, xmm4 + punpcklbw xmm4, xmm6 + punpckhbw xmm5, xmm6 + pmullw xmm4, xmm1 + pmullw xmm5, xmm1 + + ; sum, round and shift + paddw xmm2, xmm4 + paddw xmm3, xmm5 + paddw xmm2, [GLOBAL(tMFQE_round)] + paddw xmm3, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + psrlw xmm3, 4 + + packuswb xmm2, xmm3 + movdqa [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + + ret + +;void vp9_filter_by_weight8x8_sse2 +;( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride, +; int src_weight +;) +global sym(vp9_filter_by_weight8x8_sse2) PRIVATE +sym(vp9_filter_by_weight8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movd xmm0, arg(4) ; src_weight + pshuflw xmm0, xmm0, 0x0 ; replicate to all low words + punpcklqdq xmm0, xmm0 ; replicate to all hi words + + movdqa xmm1, [GLOBAL(tMFQE)] + psubw xmm1, xmm0 ; dst_weight + + mov 
rax, arg(0) ; src + mov rsi, arg(1) ; src_stride + mov rdx, arg(2) ; dst + mov rdi, arg(3) ; dst_stride + + mov rcx, 8 ; loop count + pxor xmm4, xmm4 + +.combine + movq xmm2, [rax] + movq xmm3, [rdx] + add rax, rsi + + ; src * src_weight + punpcklbw xmm2, xmm4 + pmullw xmm2, xmm0 + + ; dst * dst_weight + punpcklbw xmm3, xmm4 + pmullw xmm3, xmm1 + + ; sum, round and shift + paddw xmm2, xmm3 + paddw xmm2, [GLOBAL(tMFQE_round)] + psrlw xmm2, 4 + + packuswb xmm2, xmm4 + movq [rdx], xmm2 + add rdx, rdi + + dec rcx + jnz .combine + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + + ret + +;void vp9_variance_and_sad_16x16_sse2 | arg +;( +; unsigned char *src1, 0 +; int stride1, 1 +; unsigned char *src2, 2 +; int stride2, 3 +; unsigned int *variance, 4 +; unsigned int *sad, 5 +;) +global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE +sym(vp9_variance_and_sad_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(0) ; src1 + mov rcx, arg(1) ; stride1 + mov rdx, arg(2) ; src2 + mov rdi, arg(3) ; stride2 + + mov rsi, 16 ; block height + + ; Prep accumulator registers + pxor xmm3, xmm3 ; SAD + pxor xmm4, xmm4 ; sum of src2 + pxor xmm5, xmm5 ; sum of src2^2 + + ; Because we're working with the actual output frames + ; we can't depend on any kind of data alignment. +.accumulate + movdqa xmm0, [rax] ; src1 + movdqa xmm1, [rdx] ; src2 + add rax, rcx ; src1 + stride1 + add rdx, rdi ; src2 + stride2 + + ; SAD(src1, src2) + psadbw xmm0, xmm1 + paddusw xmm3, xmm0 + + ; SUM(src2) + pxor xmm2, xmm2 + psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0 + paddusw xmm4, xmm2 + + ; pmaddubsw would be ideal if it took two unsigned values. instead, + ; it expects a signed and an unsigned value. so instead we zero extend + ; and operate on words. 
+ pxor xmm2, xmm2 + movdqa xmm0, xmm1 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddwd xmm0, xmm0 + pmaddwd xmm1, xmm1 + paddd xmm5, xmm0 + paddd xmm5, xmm1 + + sub rsi, 1 + jnz .accumulate + + ; phaddd only operates on adjacent double words. + ; Finalize SAD and store + movdqa xmm0, xmm3 + psrldq xmm0, 8 + paddusw xmm0, xmm3 + paddd xmm0, [GLOBAL(t128)] + psrld xmm0, 8 + + mov rax, arg(5) + movd [rax], xmm0 + + ; Accumulate sum of src2 + movdqa xmm0, xmm4 + psrldq xmm0, 8 + paddusw xmm0, xmm4 + ; Square src2. Ignore high value + pmuludq xmm0, xmm0 + psrld xmm0, 8 + + ; phaddw could be used to sum adjacent values but we want + ; all the values summed. promote to doubles, accumulate, + ; shift and sum + pxor xmm2, xmm2 + movdqa xmm1, xmm5 + punpckldq xmm1, xmm2 + punpckhdq xmm5, xmm2 + paddd xmm1, xmm5 + movdqa xmm2, xmm1 + psrldq xmm1, 8 + paddd xmm1, xmm2 + + psubd xmm1, xmm0 + + ; (variance + 128) >> 8 + paddd xmm1, [GLOBAL(t128)] + psrld xmm1, 8 + mov rax, arg(4) + + movd [rax], xmm1 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +t128: +%ifndef __NASM_VER__ + ddq 128 +%elif CONFIG_BIG_ENDIAN + dq 0, 128 +%else + dq 128, 0 +%endif +align 16 +tMFQE: ; 1 << MFQE_PRECISION + times 8 dw 0x10 +align 16 +tMFQE_round: ; 1 << (MFQE_PRECISION - 1) + times 8 dw 0x08 diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index ffc59a6ed..ea4edbffe 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -36,7 +36,6 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" -#include "vp9/decoder/vp9_dthread.h" #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/decoder/vp9_reader.h" @@ -727,6 +726,8 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { } cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; 
cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; + cm->frame_bufs[cm->new_fb_idx].buf.color_space = + (vpx_color_space_t)cm->color_space; cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; } @@ -1225,8 +1226,8 @@ static void read_bitdepth_colorspace_sampling( cm->use_highbitdepth = 0; #endif } - cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); - if (cm->color_space != SRGB) { + cm->color_space = vp9_rb_read_literal(rb, 3); + if (cm->color_space != VPX_CS_SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { cm->subsampling_x = vp9_rb_read_bit(rb); @@ -1326,7 +1327,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, // of either the color format or color sub-sampling in profile 0. VP9 // specifies that the default color format should be YUV 4:2:0 in this // case (normative). - cm->color_space = BT_601; + cm->color_space = VPX_CS_BT_601; cm->subsampling_y = cm->subsampling_x = 1; cm->bit_depth = VPX_BITS_8; #if CONFIG_VP9_HIGHBITDEPTH @@ -1589,9 +1590,9 @@ void vp9_decode_frame(VP9Decoder *pbi, if (!xd->corrupted) { // If multiple threads are used to decode tiles, then we use those threads // to do parallel loopfiltering. - vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm, - pbi->tile_workers, pbi->num_tile_workers, - cm->lf.filter_level, 0); + vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, + 0, 0, pbi->tile_workers, pbi->num_tile_workers, + &pbi->lf_row_sync); } else { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Decode failed. 
Frame data is corrupted."); diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 1d254d2db..7bef265b8 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -32,7 +32,6 @@ #include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_detokenize.h" -#include "vp9/decoder/vp9_dthread.h" static void initialize_dec(void) { static volatile int init_done = 0; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 25b7339ed..1415019a1 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -15,12 +15,11 @@ #include "vpx/vpx_codec.h" #include "vpx_scale/yv12config.h" - +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_thread.h" - -#include "vp9/decoder/vp9_dthread.h" +#include "vp9/decoder/vp9_reader.h" #ifdef __cplusplus extern "C" { @@ -33,6 +32,13 @@ typedef struct TileData { DECLARE_ALIGNED(16, MACROBLOCKD, xd); } TileData; +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + struct vpx_internal_error_info error_info; +} TileWorkerData; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c new file mode 100644 index 000000000..f505fcb7a --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { + const uint32x4_t a = vpaddlq_u16(v_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { + uint8x8_t v_s0 = vld1_u8(s); + const uint8x8_t v_s1 = vld1_u8(s + p); + uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + + v_s0 = vld1_u8(s + 2 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 3 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 4 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 5 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 6 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 7 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + return (horizontal_add_u16x8(v_sum) + 32) >> 6; +} diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 6c66f5d5b..a6d4797ad 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -32,6 +32,24 @@ void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { } } +void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, + int16_t* coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t* zbin_ptr, + const int16_t* round_ptr, const int16_t* quant_ptr, + const int16_t* quant_shift_ptr, + int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr, + const int16_t* dequant_ptr, uint16_t* eob_ptr, + const int16_t* scan_ptr, + const int16_t* iscan_ptr) { + int16_t temp_buffer[64]; + (void)coeff_ptr; + + vp9_fdct8x8_neon(input, temp_buffer, stride); + vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_ptr, 
iscan_ptr); +} + void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { int i; // stage 1 diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index 816fbda1f..b1ad83731 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -10,6 +10,7 @@ #include <arm_neon.h> #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" @@ -28,6 +29,9 @@ enum { kHeight16PlusOne = 17 }; enum { kWidth32 = 32 }; enum { kHeight32 = 32 }; enum { kHeight32PlusOne = 33 }; +enum { kWidth64 = 64 }; +enum { kHeight64 = 64 }; +enum { kHeight64PlusOne = 65 }; enum { kPixelStepOne = 1 }; enum { kAlign16 = 16 }; @@ -46,9 +50,10 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { return vget_lane_s32(c, 0); } +// w * h must be less than 2048 or local variable v_sum may overflow. static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { + int w, int h, uint32_t *sse, int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); @@ -88,7 +93,7 @@ unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); - return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); + return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 } void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, @@ -103,7 +108,7 @@ unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); - return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); + return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 } static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, @@ -205,7 +210,62 @@ 
unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); - return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); + return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 +} + +unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1); + variance_neon_w8(a + (kHeight32 * a_stride), a_stride, + b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1); + variance_neon_w8(a + (kHeight16 * a_stride), a_stride, + b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1); + variance_neon_w8(a + (kHeight16 * a_stride), a_stride, + b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16, + &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride, + b + (kHeight16 * 2 * b_stride), b_stride, + kWidth64, kHeight16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride, + b + (kHeight16 * 3 * b_stride), 
b_stride, + kWidth64, kHeight16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 } unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, @@ -225,3 +285,21 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); } + +unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight64PlusOne, kWidth64, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64, + kWidth64, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse); +} diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 19bcfd2b6..3f4ed94d6 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -1055,7 +1055,7 @@ static void write_bitdepth_colorspace_sampling( vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1); } vp9_wb_write_literal(wb, cm->color_space, 3); - if (cm->color_space != SRGB) { + if (cm->color_space != VPX_CS_SRGB) { vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. 
xvYCC), 1: [0, 255] if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index ab8533703..7d4e26aaf 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -352,6 +352,7 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) { int r; const uint8_t *srcbuf = src.y_buffer; uint8_t *destbuf = dest.y_buffer; + assert(dest.y_width == src.y_width); assert(dest.y_height == src.y_height); @@ -362,13 +363,13 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) { } } -static void swap_frame_buffer(YV12_BUFFER_CONFIG dest, - YV12_BUFFER_CONFIG src) { - uint8_t *tmp_buf = dest.y_buffer; - assert(dest.y_width == src.y_width); - assert(dest.y_height == src.y_height); - dest.y_buffer = src.y_buffer; - src.y_buffer = tmp_buf; +static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest, + YV12_BUFFER_CONFIG *src) { + uint8_t *tmp_buf = dest->y_buffer; + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + dest->y_buffer = src->y_buffer; + src->y_buffer = tmp_buf; } void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, @@ -387,16 +388,16 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, /* For non key frames */ if (refresh_alt_ref_frame) { - swap_frame_buffer(denoiser->running_avg_y[ALTREF_FRAME], - denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], + &denoiser->running_avg_y[INTRA_FRAME]); } if (refresh_golden_frame) { - swap_frame_buffer(denoiser->running_avg_y[GOLDEN_FRAME], - denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], + &denoiser->running_avg_y[INTRA_FRAME]); } if (refresh_last_frame) { - swap_frame_buffer(denoiser->running_avg_y[LAST_FRAME], - denoiser->running_avg_y[INTRA_FRAME]); + 
swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], + &denoiser->running_avg_y[INTRA_FRAME]); } } @@ -477,13 +478,9 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { return; } for (i = 0; i < MAX_REF_FRAMES; ++i) { - if (&denoiser->running_avg_y[i] != NULL) { - vp9_free_frame_buffer(&denoiser->running_avg_y[i]); - } - } - if (&denoiser->mc_running_avg_y != NULL) { - vp9_free_frame_buffer(&denoiser->mc_running_avg_y); + vp9_free_frame_buffer(&denoiser->running_avg_y[i]); } + vp9_free_frame_buffer(&denoiser->mc_running_avg_y); } #ifdef OUTPUT_YUV_DENOISED diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 756052771..e142a3181 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -401,55 +401,42 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, + int threshold, + BLOCK_SIZE bsize_min) { VP9_COMMON * const cm = &cpi->common; variance_node vt; const int block_width = num_8x8_blocks_wide_lookup[bsize]; const int block_height = num_8x8_blocks_high_lookup[bsize]; - // TODO(marpan): Adjust/tune these thresholds. - const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4; - int64_t threshold = - (int64_t)(threshold_multiplier * - vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth)); - int64_t threshold_bsize_ref = threshold << 6; - int64_t threshold_low = threshold; - BLOCK_SIZE bsize_ref = BLOCK_16X16; assert(block_height == block_width); tree_to_node(data, bsize, &vt); - if (cm->frame_type == KEY_FRAME) { - bsize_ref = BLOCK_8X8; - // Choose lower thresholds for key frame variance to favor split, but keep - // threshold for splitting to 4x4 block still fairly high for now. 
- threshold_bsize_ref = threshold << 2; - threshold_low = threshold >> 2; - } - - // For bsize=bsize_ref (16x16/8x8 for 8x8/4x4 downsampling), select if + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if // variance is below threshold, otherwise split will be selected. // No check for vert/horiz split as too few samples for variance. - if (bsize == bsize_ref) { + if (bsize == bsize_min) { get_variance(&vt.part_variances->none); if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && - vt.part_variances->none.variance < threshold_bsize_ref) { + vt.part_variances->none.variance < threshold) { set_block_size(cpi, xd, mi_row, mi_col, bsize); return 1; } return 0; - } else if (bsize > bsize_ref) { + } else if (bsize > bsize_min) { get_variance(&vt.part_variances->none); - // For key frame, for bsize above 32X32, or very high variance, take split. + // For key frame or low_res: for bsize above 32X32 or very high variance, + // take split. if (cm->frame_type == KEY_FRAME && (bsize > BLOCK_32X32 || - vt.part_variances->none.variance > (threshold << 2))) { + vt.part_variances->none.variance > (threshold << 4))) { return 0; } // If variance is low, take the bsize (no split). 
if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && - vt.part_variances->none.variance < threshold_low) { + vt.part_variances->none.variance < threshold) { set_block_size(cpi, xd, mi_row, mi_col, bsize); return 1; } @@ -458,8 +445,8 @@ static int set_vt_partitioning(VP9_COMP *cpi, if (mi_row + block_height / 2 < cm->mi_rows) { get_variance(&vt.part_variances->vert[0]); get_variance(&vt.part_variances->vert[1]); - if (vt.part_variances->vert[0].variance < threshold_low && - vt.part_variances->vert[1].variance < threshold_low) { + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); set_block_size(cpi, xd, mi_row, mi_col, subsize); set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize); @@ -470,8 +457,8 @@ static int set_vt_partitioning(VP9_COMP *cpi, if (mi_col + block_width / 2 < cm->mi_cols) { get_variance(&vt.part_variances->horz[0]); get_variance(&vt.part_variances->horz[1]); - if (vt.part_variances->horz[0].variance < threshold_low && - vt.part_variances->horz[1].variance < threshold_low) { + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); set_block_size(cpi, xd, mi_row, mi_col, subsize); set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize); @@ -485,8 +472,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, } // This function chooses partitioning based on the variance between source and -// reconstructed last, where variance is computed for downsampled inputs. -// Currently 8x8 downsampling is used for delta frames, 4x4 for key frames. +// reconstructed last, where variance is computed for downs-sampled inputs. 
static void choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCK *x, @@ -496,6 +482,7 @@ static void choose_partitioning(VP9_COMP *cpi, int i, j, k, m; v64x64 vt; + v16x16 vt2[16]; uint8_t *s; const uint8_t *d; int sp; @@ -504,6 +491,27 @@ static void choose_partitioning(VP9_COMP *cpi, const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; + // Always use 4x4 partition for key frame. + int use_4x4_partition = (cm->frame_type == KEY_FRAME); + + int variance4x4downsample[16]; + int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0; + const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4; + int64_t threshold_base = (int64_t)(threshold_multiplier * + vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth)); + int64_t threshold = threshold_base; + int64_t threshold_bsize_min = threshold_base << 6; + int64_t threshold_bsize_max = threshold_base; + // Modify thresholds for key frame and for low-resolutions (set lower + // thresholds to favor split). + if (cm->frame_type == KEY_FRAME) { + threshold = threshold_base >> 2; + threshold_bsize_min = threshold_base << 2; + } else if (low_res) { + threshold_bsize_min = threshold_base << 3; + threshold_bsize_max = threshold_base >> 2; + } + vp9_clear_system_state(); set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); @@ -546,130 +554,169 @@ static void choose_partitioning(VP9_COMP *cpi, #endif // CONFIG_VP9_HIGHBITDEPTH } - // Fill in the entire tree of 8x8 variances for splits. + // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances + // for splits. 
for (i = 0; i < 4; i++) { const int x32_idx = ((i & 1) << 5); const int y32_idx = ((i >> 1) << 5); + const int i2 = i << 2; for (j = 0; j < 4; j++) { const int x16_idx = x32_idx + ((j & 1) << 4); const int y16_idx = y32_idx + ((j >> 1) << 4); v16x16 *vst = &vt.split[i].split[j]; - for (k = 0; k < 4; k++) { - int x8_idx = x16_idx + ((k & 1) << 3); - int y8_idx = y16_idx + ((k >> 1) << 3); - if (cm->frame_type != KEY_FRAME) { - unsigned int sse = 0; - int sum = 0; - if (x8_idx < pixels_wide && y8_idx < pixels_high) { - int s_avg, d_avg; + variance4x4downsample[i2 + j] = 0; + if (cm->frame_type != KEY_FRAME) { + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int s_avg, d_avg; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); - d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); - } else { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); + d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } else { + s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); + d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } +#else s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); - } -#else - s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); - d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); #endif - sum = s_avg - d_avg; - sse = sum * sum; - } - // If variance is based on 8x8 downsampling, we stop here and have - // one sample for 8x8 block (so use 1 for count in fill_variance), - // which of course means variance = 0 for 8x8 block. - fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); - } else { - // For key frame, go down to 4x4. 
- v8x8 *vst2 = &vst->split[k]; + sum = s_avg - d_avg; + sse = sum * sum; + } + // If variance is based on 8x8 downsampling, we stop here and have + // one sample for 8x8 block (so use 1 for count in fill_variance), + // which of course means variance = 0 for 8x8 block. + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } + fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); + // For low-resolution, compute the variance based on 8x8 down-sampling, + // and if it is large (above the threshold) we go down for 4x4. + // For key frame we always go down to 4x4. + if (low_res) + get_variance(&vt.split[i].split[j].part_variances.none); + } + if (cm->frame_type == KEY_FRAME || (low_res && + vt.split[i].split[j].part_variances.none.variance > + (threshold << 1))) { + // Go down to 4x4 down-sampling for variance. + variance4x4downsample[i2 + j] = 1; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + v8x8 *vst2 = (cm->frame_type == KEY_FRAME) ? &vst->split[k] : + &vt2[i2 + j].split[k]; for (m = 0; m < 4; m++) { int x4_idx = x8_idx + ((m & 1) << 2); int y4_idx = y8_idx + ((m >> 1) << 2); unsigned int sse = 0; int sum = 0; if (x4_idx < pixels_wide && y4_idx < pixels_high) { + int d_avg = 128; #if CONFIG_VP9_HIGHBITDEPTH int s_avg; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (cm->frame_type != KEY_FRAME) + d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); } else { s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (cm->frame_type != KEY_FRAME) + d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); } #else int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (cm->frame_type != KEY_FRAME) + d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); #endif - // For key frame, reference is set to 128. 
- sum = s_avg - 128; + sum = s_avg - d_avg; sse = sum * sum; } - // If variance is based on 4x4 downsampling, we stop here and have + // If variance is based on 4x4 down-sampling, we stop here and have // one sample for 4x4 block (so use 1 for count in fill_variance), // which of course means variance = 0 for 4x4 block. - fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none); + fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none); } } } } } + // Fill the rest of the variance tree by summing split partition values. for (i = 0; i < 4; i++) { + const int i2 = i << 2; for (j = 0; j < 4; j++) { - if (cm->frame_type == KEY_FRAME) { + if (variance4x4downsample[i2 + j] == 1) { + v16x16 *vtemp = (cm->frame_type != KEY_FRAME) ? &vt2[i2 + j] : + &vt.split[i].split[j]; for (m = 0; m < 4; m++) { - fill_variance_tree(&vt.split[i].split[j].split[m], BLOCK_8X8); + fill_variance_tree(&vtemp->split[m], BLOCK_8X8); } + fill_variance_tree(vtemp, BLOCK_16X16); } - fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); } fill_variance_tree(&vt.split[i], BLOCK_32X32); } fill_variance_tree(&vt, BLOCK_64X64); + // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold. 
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows || - !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col)) { + !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col, + threshold_bsize_max, BLOCK_16X16)) { for (i = 0; i < 4; ++i) { const int x32_idx = ((i & 1) << 2); const int y32_idx = ((i >> 1) << 2); + const int i2 = i << 2; if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32, - (mi_row + y32_idx), (mi_col + x32_idx))) { + (mi_row + y32_idx), (mi_col + x32_idx), + threshold, BLOCK_16X16)) { for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); - // Note: If 8x8 downsampling is used for variance calculation we - // cannot really select block size 8x8 (or even 8x16/16x8), since we - // don't have sufficient samples for variance. So on delta frames, - // 8x8 partition is only set if variance of the 16x16 block is very - // high. For key frames, 4x4 downsampling is used, so we can better - // select 8x16/16x8 and 8x8. 4x4 partition can potentially be set - // used here too, but for now 4x4 is not allowed. - if (!set_vt_partitioning(cpi, xd, &vt.split[i].split[j], - BLOCK_16X16, + // TODO(marpan): Allow 4x4 partitions for inter-frames. + // use_4x4_partition = (variance4x4downsample[i2 + j] == 1); + // If 4x4 partition is not used, then 8x8 partition will be selected + // if variance of 16x16 block is very high, so use larger threshold + // for 16x16 (threshold_bsize_min) in that case. + uint64_t threshold_16x16 = (use_4x4_partition) ? threshold : + threshold_bsize_min; + BLOCK_SIZE bsize_min = (use_4x4_partition) ? BLOCK_8X8 : BLOCK_16X16; + // For inter frames: if variance4x4downsample[] == 1 for this 16x16 + // block, then the variance is based on 4x4 down-sampling, so use vt2 + // in set_vt_partioning(), otherwise use vt. + v16x16 *vtemp = (cm->frame_type != KEY_FRAME && + variance4x4downsample[i2 + j] == 1) ? 
+ &vt2[i2 + j] : &vt.split[i].split[j]; + if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16, mi_row + y32_idx + y16_idx, - mi_col + x32_idx + x16_idx)) { + mi_col + x32_idx + x16_idx, + threshold_16x16, bsize_min)) { for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - if (cm->frame_type == KEY_FRAME) { - if (!set_vt_partitioning(cpi, xd, - &vt.split[i].split[j].split[k], + if (use_4x4_partition) { + if (!set_vt_partitioning(cpi, xd, &vtemp->split[k], BLOCK_8X8, mi_row + y32_idx + y16_idx + y8_idx, - mi_col + x32_idx + x16_idx + x8_idx)) { - set_block_size(cpi, xd, - (mi_row + y32_idx + y16_idx + y8_idx), - (mi_col + x32_idx + x16_idx + x8_idx), - BLOCK_4X4); + mi_col + x32_idx + x16_idx + x8_idx, + threshold_bsize_min, BLOCK_8X8)) { + set_block_size(cpi, xd, + (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx), + BLOCK_4X4); } } else { set_block_size(cpi, xd, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8); - } + } } } } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index da9f34253..c85bf2a0e 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -607,7 +607,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth = oxcf->use_highbitdepth; #endif - cm->color_space = UNKNOWN; + cm->color_space = oxcf->color_space; cm->width = oxcf->width; cm->height = oxcf->height; @@ -1264,6 +1264,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; + cm->color_space = oxcf->color_space; if (cm->profile <= PROFILE_1) assert(cm->bit_depth == VPX_BITS_8); @@ -1311,6 +1312,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cm->display_width = cpi->oxcf.width; cm->display_height = cpi->oxcf.height; + cm->width = 
cpi->oxcf.width; + cm->height = cpi->oxcf.height; if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some @@ -1785,7 +1788,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { for (t = 0; t < cpi->num_workers; ++t) { VP9Worker *const worker = &cpi->workers[t]; - EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; // Deallocate allocated threads. vp9_get_worker_interface()->end(worker); @@ -1796,11 +1799,13 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_free_pc_tree(thread_data->td); vpx_free(thread_data->td); } - - vpx_free(worker->data1); } + vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); + if (cpi->num_workers > 1) + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + dealloc_compressor_data(cpi); for (i = 0; i < sizeof(cpi->mbgraph_stats) / @@ -2436,7 +2441,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { } if (lf->filter_level > 0) { - vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, + lf->filter_level, 0, 0, + cpi->workers, cpi->num_workers, + &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index d256935f1..cf269c108 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -19,6 +19,7 @@ #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_thread.h" @@ -36,6 +37,7 @@ #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" + #if CONFIG_VP9_TEMPORAL_DENOISING #include "vp9/encoder/vp9_denoiser.h" #endif @@ -231,6 +233,7 @@ typedef 
struct VP9EncoderConfig { #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth; #endif + vpx_color_space_t color_space; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -261,6 +264,8 @@ typedef struct ThreadData { PC_TREE *pc_root; } ThreadData; +struct EncWorkerData; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -446,6 +451,8 @@ typedef struct VP9_COMP { // Multi-threading int num_workers; VP9Worker *workers; + struct EncWorkerData *tile_thr_data; + VP9LfSync lf_row_sync; } VP9_COMP; void vp9_initialize_enc(void); diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index daf3da44c..12fb4d107 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -167,23 +167,24 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + for (i = 0; i < num_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; ++cpi->num_workers; - winterface->init(worker); - CHECK_MEM_ERROR(cm, worker->data1, - (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData))); - thread_data = (EncWorkerData*)worker->data1; if (i < num_workers - 1) { thread_data->cpi = cpi; // Allocate thread data. CHECK_MEM_ERROR(cm, thread_data->td, - vpx_calloc(1, sizeof(*thread_data->td))); + vpx_memalign(32, sizeof(*thread_data->td))); + vp9_zero(*thread_data->td); + // Set up pc_tree. thread_data->td->leaf_tree = NULL; thread_data->td->pc_tree = NULL; @@ -203,17 +204,18 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { thread_data->td = &cpi->td; } - // data2 is unused. 
- worker->data2 = NULL; - winterface->sync(worker); - worker->hook = (VP9WorkerHook)enc_worker_hook; } } for (i = 0; i < num_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + EncWorkerData *thread_data; + + worker->hook = (VP9WorkerHook)enc_worker_hook; + worker->data1 = &cpi->tile_thr_data[i]; + worker->data2 = NULL; + thread_data = (EncWorkerData*)worker->data1; // Before encoding a frame, copy the thread data from cpi. thread_data->td->mb = cpi->td.mb; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 74f5efbec..3b0f2f012 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1843,6 +1843,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); + // Was the group length constrained by the requirement for a new KF? + rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + // Set the interval until the next gf. 
if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) rc->baseline_gf_interval = i - 1; diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 01cc519f9..a95f0f46d 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -39,8 +39,14 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMMON *const cm = &cpi->common; int64_t filt_err; - vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame); + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, + filt_level, 1, partial_frame, + cpi->workers, cpi->num_workers, &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, + 1, partial_frame); + #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show); diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 3cc9d9a7b..21f4cce03 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1254,7 +1254,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { // better than that already stored. // This is used to help set quality in forced key frames to reduce popping if ((qindex < rc->last_boosted_qindex) || - (((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame || + (cm->frame_type == KEY_FRAME) || + (!rc->constrained_gf_group && + (cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } @@ -1358,8 +1360,12 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; rc->frames_till_gf_update_due = rc->baseline_gf_interval; // NOTE: frames_till_gf_update_due must be <= frames_to_key. 
- if (rc->frames_till_gf_update_due > rc->frames_to_key) + if (rc->frames_till_gf_update_due > rc->frames_to_key) { rc->frames_till_gf_update_due = rc->frames_to_key; + rc->constrained_gf_group = 1; + } else { + rc->constrained_gf_group = 0; + } cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; rc->gfu_boost = DEFAULT_GF_BOOST; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index cf709274d..a53f4e0a2 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -55,6 +55,7 @@ typedef struct { int max_gf_interval; int static_scene_max_gf_interval; int baseline_gf_interval; + int constrained_gf_group; int frames_to_key; int frames_since_key; int this_key_frame_forced; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 375407d44..adbe0244d 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -570,10 +570,6 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) { rd->thresh_mult[THR_NEWA] += 1000; rd->thresh_mult[THR_NEWG] += 1000; - // Adjust threshold only in real time mode, which only uses last - // reference frame. - rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh; - rd->thresh_mult[THR_NEARMV] += 1000; rd->thresh_mult[THR_NEARA] += 1000; rd->thresh_mult[THR_COMP_NEARESTLA] += 1000; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 15831fbbe..81f3195fe 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -446,7 +446,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->always_this_block_size = BLOCK_16X16; sf->search_type_check_frequency = 50; sf->encode_breakout_thresh = 0; - sf->elevate_newmv_thresh = 0; // Recode loop tolerance %. 
sf->recode_tolerance = 25; sf->default_interp_filter = SWITCHABLE; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index c2cfd62da..eaa0accdb 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -390,9 +390,6 @@ typedef struct SPEED_FEATURES { // enabled in real time mode. int encode_breakout_thresh; - // In real time encoding, increase the threshold for NEWMV. - int elevate_newmv_thresh; - // default interp filter choice INTERP_FILTER default_interp_filter; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 31e93be65..82bce3780 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -279,6 +279,7 @@ static void get_layer_resolution(const int width_org, const int height_org, int vp9_svc_start_frame(VP9_COMP *const cpi) { int width = 0, height = 0; LAYER_CONTEXT *lc; + struct lookahead_entry *buf; int count = 1 << (cpi->svc.number_temporal_layers - 1); cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; @@ -339,8 +340,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { // since its previous frame could be changed during decoding time. The idea is // we put a empty invisible frame in front of them, then we will not use // prev_mi when encoding these frames. 
+ + buf = vp9_lookahead_peek(cpi->lookahead, 0); if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && - cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) { + cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE && + lc->rc.frames_to_key != 0 && + !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) { if ((cpi->svc.number_temporal_layers > 1 && cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || (cpi->svc.number_spatial_layers > 1 && diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 76602c2d7..58920e247 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h VP9_COMMON_SRCS-yes += common/vp9_enums.h VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h +VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h VP9_COMMON_SRCS-yes += common/vp9_mv.h VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h VP9_COMMON_SRCS-yes += common/vp9_pred_common.h @@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c +VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h VP9_COMMON_SRCS-yes += common/vp9_quant_common.c @@ -82,6 +84,7 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 531a03ebe..589f0b1bf 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -42,6 +42,7 @@ struct vp9_extracfg { unsigned 
int frame_periodic_boost; vpx_bit_depth_t bit_depth; vp9e_tune_content content; + vpx_color_space_t color_space; }; static struct vp9_extracfg default_extra_cfg = { @@ -64,7 +65,8 @@ static struct vp9_extracfg default_extra_cfg = { NO_AQ, // aq_mode 0, // frame_periodic_delta_q VPX_BITS_8, // Bit depth - VP9E_CONTENT_DEFAULT // content + VP9E_CONTENT_DEFAULT, // content + VPX_CS_UNKNOWN, // color space }; struct vpx_codec_alg_priv { @@ -81,6 +83,7 @@ struct vpx_codec_alg_priv { size_t pending_frame_sizes[8]; size_t pending_frame_magnitude; vpx_image_t preview_img; + vpx_enc_frame_flags_t next_frame_flags; vp8_postproc_cfg_t preview_ppcfg; vpx_codec_pkt_list_decl(256) pkt_list; unsigned int fixed_kf_cntr; @@ -294,7 +297,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, cfg->g_bit_depth == VPX_BITS_8) { ERROR("Codec bit-depth 8 not supported in profile > 1"); } - + RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB); return VPX_CODEC_OK; } @@ -437,6 +440,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; #endif + oxcf->color_space = extra_cfg->color_space; oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; oxcf->arnr_strength = extra_cfg->arnr_strength; @@ -521,9 +525,17 @@ static vpx_codec_err_t set_encoder_config( static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - - if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) - ERROR("Cannot change width or height after initialization"); + int force_key = 0; + + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || + (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + ERROR("Cannot increase width or height larger than 
their initial values"); + if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h)) + force_key = 1; + } // Prevent increasing lag_in_frames. This check is stricter than it needs // to be -- the limit is not increasing past the first lag_in_frames @@ -540,6 +552,9 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, vp9_change_config(ctx->cpi, &ctx->oxcf); } + if (force_key) + ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; + return res; } @@ -952,10 +967,11 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Store the original flags in to the frame buffer. Will extract the // key frame flag when we actually encode this frame. - if (vp9_receive_raw_frame(cpi, flags, + if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, dst_time_stamp, dst_end_time_stamp)) { res = update_error_state(ctx, &cpi->common.error); } + ctx->next_frame_flags = 0; } cx_data = ctx->cx_data; @@ -1322,6 +1338,13 @@ static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, {VP8E_UPD_ENTROPY, ctrl_update_entropy}, @@ -1357,6 +1380,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP9E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback}, {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, + {VP9E_SET_COLOR_SPACE, ctrl_set_color_space}, {VP9E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity}, // Getters diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index b9177876e..c0e429736 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -95,12 +95,11 @@ static vpx_codec_err_t 
decoder_destroy(vpx_codec_alg_priv_t *ctx) { static int parse_bitdepth_colorspace_sampling( BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) { - const int sRGB = 7; - int colorspace; + vpx_color_space_t color_space; if (profile >= PROFILE_2) rb->bit_offset += 1; // Bit-depth 10 or 12. - colorspace = vp9_rb_read_literal(rb, 3); - if (colorspace != sRGB) { + color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3); + if (color_space != VPX_CS_SRGB) { rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. if (profile == PROFILE_1 || profile == PROFILE_3) { rb->bit_offset += 2; // subsampling x/y. diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h index 00fbfdd7d..e585aa147 100644 --- a/vp9/vp9_iface_common.h +++ b/vp9/vp9_iface_common.h @@ -34,6 +34,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, bps = 12; } } + img->cs = yv12->color_space; img->bit_depth = 8; img->w = yv12->y_stride; img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); @@ -92,6 +93,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->y_stride = img->stride[VPX_PLANE_Y]; yv12->uv_stride = img->stride[VPX_PLANE_U]; + yv12->color_space = img->cs; #if CONFIG_VP9_HIGHBITDEPTH if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c75fd8a01..33a1e6735 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -150,6 +150,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 1fcb36f66..603158a9c 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -21,8 
+21,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_reader.h VP9_DX_SRCS-yes += decoder/vp9_reader.c VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 4fc0fd62f..a920ee3f9 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -10,7 +10,7 @@ #ifndef VPX_VP8CX_H_ #define VPX_VP8CX_H_ -/*!\defgroup vp8_encoder WebM VP8 Encoder +/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder * \ingroup vp8 * * @{ @@ -18,7 +18,7 @@ #include "./vp8.h" /*!\file - * \brief Provides definitions for using the VP8 encoder algorithm within the + * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the * vpx Codec Interface. */ @@ -28,17 +28,20 @@ extern "C" { /*!\name Algorithm interface for VP8 * - * This interface provides the capability to encode raw VP8 streams, as would - * be found in AVI files. + * This interface provides the capability to encode raw VP8 streams. * @{ */ extern vpx_codec_iface_t vpx_codec_vp8_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); +/*!@} - end algorithm interface member group*/ -/* TODO(jkoleszar): These move to VP9 in a later patch set. */ +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to encode raw VP9 streams. 
+ * @{ + */ extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); - /*!@} - end algorithm interface member group*/ @@ -234,20 +237,111 @@ enum vp8e_enc_control_id { VP8E_SET_SCREEN_CONTENT_MODE, /**<control function to set encoder screen content mode */ - /* TODO(jkoleszar): Move to vp9cx.h */ + /*!\brief Codec control function to set lossless encoding mode + * + * VP9 can operate in lossless encoding mode, in which the bitstream + * produced will be able to decode and reconstruct a perfect copy of + * input source. This control function provides a means to switch encoder + * into lossless coding mode(1) or normal coding mode(0) that may be lossy. + * 0 = lossy coding mode + * 1 = lossless coding mode + * + * By default, encoder operates in normal coding mode (may be lossy). + */ VP9E_SET_LOSSLESS, + + /*!\brief Codec control function to set number of tile columns + * + * In encoding and decoding, VP9 allows an input image frame be partitioned + * into separated vertical tile columns, which can be encoded or decoded + * independently. This enables easy implementation of parallel encoding and + * decoding. This control requests the encoder to use column tiles in + * encoding an input frame, with number of tile columns (in Log2 unit) as + * the parameter: + * 0 = 1 tile column + * 1 = 2 tile columns + * 2 = 4 tile columns + * ..... + * n = 2**n tile columns + * The requested tile columns will be capped by encoder based on image size + * limitation (The minimum width of a tile column is 256 pixels, the maximum + * is 4096). + * + * By default, the value is 0, i.e. one single column tile for entire image. + */ VP9E_SET_TILE_COLUMNS, + + /*!\brief Codec control function to set number of tile rows + * + * In encoding and decoding, VP9 allows an input image frame be partitioned + * into separated horizontal tile rows. Tile rows are encoded or decoded + * sequentially.
Even though encoding/decoding of later tile rows depends on + * earlier ones, this allows the encoder to output data packets for tile rows + * prior to completely processing all tile rows in a frame, thereby reducing + * the latency in processing between input and output. The parameter + * for this control describes the number of tile rows, which has a valid + * range [0, 2]: + * 0 = 1 tile row + * 1 = 2 tile rows + * 2 = 4 tile rows + * + * By default, the value is 0, i.e. one single row tile for entire image. + */ VP9E_SET_TILE_ROWS, + + /*!\brief Codec control function to enable frame parallel decoding feature + * + * VP9 has a bitstream feature to reduce decoding dependency between frames + * by turning off backward update of probability context used in encoding + * and decoding. This allows staged parallel processing of more than one + * video frame in the decoder. This control function provides a means to + * turn this feature on or off for bitstreams produced by encoder. + * + * By default, this feature is off. + */ VP9E_SET_FRAME_PARALLEL_DECODING, + + /*!\brief Codec control function to set adaptive quantization mode + * + * VP9 has a segment based feature that allows encoder to adaptively change + * quantization parameter for each segment within a frame to improve the + * subjective quality. This control makes encoder operate in one of the + * several AQ_modes supported. + * + * By default, encoder operates with AQ_Mode 0(adaptive quantization off). + */ VP9E_SET_AQ_MODE, + + /*!\brief Codec control function to enable/disable periodic Q boost + * + * One VP9 encoder speed feature is to enable quality boost by lowering + * frame level Q periodically. This control function provides a means to + * turn on/off this feature. + * 0 = off + * 1 = on + * + * By default, the encoder is allowed to use this feature for appropriate + * encoding modes.
+ */ VP9E_SET_FRAME_PERIODIC_BOOST, + /*!\brief control function to set noise sensitivity * * 0: off, 1: OnYOnly */ VP9E_SET_NOISE_SENSITIVITY, + /*!\brief control function to turn on/off SVC in encoder. + * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not + * support SVC in its current encoding mode + * 0: off, 1: on + */ VP9E_SET_SVC, + + /*!\brief control function to set parameters for SVC. + * \note Parameters contain min_q, max_q, scaling factor for each of the + * SVC layers. + */ VP9E_SET_SVC_PARAMETERS, /*!\brief control function to set svc layer for spatial and temporal. @@ -256,9 +350,38 @@ enum vp8e_enc_control_id { * temporal layer. */ VP9E_SET_SVC_LAYER_ID, + + /*!\brief control function to set content type. + * \note Valid parameter range: + * VP9E_CONTENT_DEFAULT = Regular video content (Default) + * VP9E_CONTENT_SCREEN = Screen capture content + */ VP9E_SET_TUNE_CONTENT, + + /*!\brief control function to get svc layer ID. + * \note The layer ID returned is for the data packet from the registered + * callback function. + */ VP9E_GET_SVC_LAYER_ID, + + /*!\brief control function to register callback for getting per layer packet. + * \note Parameter for this control function is a structure with a callback + * function and a pointer to private data used by the callback. + */ VP9E_REGISTER_CX_CALLBACK, + + /*!\brief control function to set color space info. + * \note Valid ranges: 0..7, default is "UNKNOWN". + * 0 = UNKNOWN, + * 1 = BT_601 + * 2 = BT_709 + * 3 = SMPTE_170 + * 4 = SMPTE_240 + * 5 = BT_2020 + * 6 = RESERVED + * 7 = SRGB + */ + VP9E_SET_COLOR_SPACE, }; /*!\brief vpx 1-D scaling mode @@ -423,6 +546,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ + +VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) /*! 
@} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 5cc25cd6a..3ba72fe3c 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -9,13 +9,13 @@ */ -/*!\defgroup vp8_decoder WebM VP8 Decoder +/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder * \ingroup vp8 * * @{ */ /*!\file - * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder + * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder * interface. */ #ifndef VPX_VP8DX_H_ @@ -30,14 +30,18 @@ extern "C" { /*!\name Algorithm interface for VP8 * - * This interface provides the capability to decode raw VP8 streams, as would - * be found in AVI files and other non-Flash uses. + * This interface provides the capability to decode VP8 streams. * @{ */ extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); +/*!@} - end algorithm interface member group*/ -/* TODO(jkoleszar): These move to VP9 in a later patch set. */ +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to decode VP9 streams. + * @{ + */ extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /*!@} - end algorithm interface member group*/ @@ -85,7 +89,14 @@ enum vp8_dec_control_id { */ VP9_SET_BYTE_ALIGNMENT, - /** For testing. */ + /** control function to invert the decoding order to right-to-left. The + * function is used in a test to confirm the decoding independence of tile + * columns. The function may be used in applications where this order + * of decoding is desired. + * + * TODO(yaowu): Rework the unit test that uses this control, and in a future + * release, this test-only control shall be removed.
+ */ VP9_INVERT_TILE_DECODE_ORDER, VP8_DECODER_CTRL_ID_MAX diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index da5bd0659..8f7bff518 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -232,8 +232,8 @@ extern "C" { /*!\brief Callback function pointer / user data pair storage */ typedef struct vpx_codec_enc_output_cx_cb_pair { - vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; - void *user_priv; + vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */ + void *user_priv; /**< Pointer to private data */ } vpx_codec_priv_output_cx_pkt_cb_pair_t; /*!\brief Rational Number @@ -737,10 +737,10 @@ extern "C" { * */ typedef struct vpx_svc_parameters { - int max_quantizers[VPX_SS_MAX_LAYERS]; - int min_quantizers[VPX_SS_MAX_LAYERS]; - int scaling_factor_num[VPX_SS_MAX_LAYERS]; - int scaling_factor_den[VPX_SS_MAX_LAYERS]; + int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */ + int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */ + int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/ + int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/ } vpx_svc_extra_cfg_t; diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index 337e4c4be..c06d35101 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -28,7 +28,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/ +#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/ #define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. 
*/ @@ -66,9 +66,22 @@ extern "C" { VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ + /*!\brief List of supported color spaces */ + typedef enum vpx_color_space { + VPX_CS_UNKNOWN = 0, /**< Unknown */ + VPX_CS_BT_601 = 1, /**< BT.601 */ + VPX_CS_BT_709 = 2, /**< BT.709 */ + VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ + VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ + VPX_CS_BT_2020 = 5, /**< BT.2020 */ + VPX_CS_RESERVED = 6, /**< Reserved */ + VPX_CS_SRGB = 7 /**< sRGB */ + } vpx_color_space_t; /**< alias for enum vpx_color_space */ + /**\brief Image Descriptor */ typedef struct vpx_image { vpx_img_fmt_t fmt; /**< Image Format */ + vpx_color_space_t cs; /**< Color Space */ /* Image storage dimensions */ unsigned int w; /**< Stored image width */ diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 81c2b8b87..ae349fb84 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -13,6 +13,7 @@ #define VPX_PORTS_X86_H_ #include <stdlib.h> #include "vpx_config.h" +#include "vpx/vpx_integer.h" #ifdef __cplusplus extern "C" { @@ -104,6 +105,37 @@ void __cpuid(int CPUInfo[4], int info_type); #endif #endif /* end others */ +// NaCl has no support for xgetbv or the raw opcode. +#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__)) +static INLINE uint64_t xgetbv(void) { + const uint32_t ecx = 0; + uint32_t eax, edx; + // Use the raw opcode for xgetbv for compatibility with older toolchains. 
+ __asm__ volatile ( + ".byte 0x0f, 0x01, 0xd0\n" + : "=a"(eax), "=d"(edx) : "c" (ecx)); + return ((uint64_t)edx << 32) | eax; +} +#elif (defined(_M_X64) || defined(_M_IX86)) && \ + defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1 +#include <immintrin.h> +#define xgetbv() _xgetbv(0) +#elif defined(_MSC_VER) && defined(_M_IX86) +static INLINE uint64_t xgetbv(void) { + uint32_t eax_, edx_; + __asm { + xor ecx, ecx // ecx = 0 + // Use the raw opcode for xgetbv for compatibility with older toolchains. + __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0 + mov eax_, eax + mov edx_, edx + } + return ((uint64_t)edx_ << 32) | eax_; +} +#else +#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains. +#endif + #define HAS_MMX 0x01 #define HAS_SSE 0x02 #define HAS_SSE2 0x04 @@ -156,14 +188,17 @@ x86_simd_caps(void) { if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1; - if (reg_ecx & BIT(28)) flags |= HAS_AVX; + // bits 27 (OSXSAVE) & 28 (256-bit AVX) + if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) { + if ((xgetbv() & 0x6) == 0x6) { + flags |= HAS_AVX; - /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ - reg_eax = 7; - reg_ecx = 0; - cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + /* Get the leaf 7 feature flags. 
Needed to check for AVX2 support */ + cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); - if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + } + } return flags & mask; } diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h index f04dee1e8..6cdc235fe 100644 --- a/vpx_scale/yv12config.h +++ b/vpx_scale/yv12config.h @@ -55,6 +55,7 @@ typedef struct yv12_buffer_config { int subsampling_x; int subsampling_y; unsigned int bit_depth; + vpx_color_space_t color_space; int corrupted; int flags; @@ -442,7 +442,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_MAX_INTER_BITRATE_PCT, VP8E_SET_GF_CBR_BOOST_PCT, VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY, - VP9E_SET_TUNE_CONTENT, + VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE, 0 }; #endif |