diff options
72 files changed, 3006 insertions, 1816 deletions
@@ -36,6 +36,7 @@ Advanced options: ${toggle_codec_srcs} in/exclude codec library source code ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_static_msvcrt} use static MSVCRT (VS builds only) + ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles ${toggle_vp8} VP8 codec support ${toggle_vp9} VP9 codec support ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) @@ -57,6 +58,7 @@ Advanced options: ${toggle_postproc_visualizer} macro block / block level visualizers ${toggle_multi_res_encoding} enable multiple-resolution encoding ${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser + ${toggle_vp9_highbitdepth} enable 10/12 bit support in VP9 ${toggle_vp9_temporal_denoising} enable vp9 temporal denoising ${toggle_webm_io} enable input from and output to WebM container diff --git a/libs.doxy_template b/libs.doxy_template index 02e290242..5a8f84728 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -36,7 +36,7 @@ DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded # by quotes) that should identify the project. -PROJECT_NAME = "WebM VP8 Codec SDK" +PROJECT_NAME = "WebM Codec SDK" # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. @@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES = 30 SHOW_USED_FILES = YES -# If the sources in your project are distributed over multiple directories -# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy -# in the documentation. The default is NO. - -SHOW_DIRECTORIES = NO - # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from the # version control system). Doxygen will invoke the program by executing (via @@ -715,12 +709,6 @@ HTML_FOOTER = HTML_STYLESHEET = -# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, -# files or namespaces will be aligned in HTML using tables. If set to -# NO a bullet list will be used. - -HTML_ALIGN_MEMBERS = YES - # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compressed HTML help file (.chm) diff --git a/mainpage.dox b/mainpage.dox index e2ec28002..ec202fa4f 100644 --- a/mainpage.dox +++ b/mainpage.dox @@ -1,4 +1,4 @@ -/*!\mainpage WebM VP8 Codec SDK +/*!\mainpage WebM Codec SDK \section main_contents Page Contents - \ref main_intro @@ -6,11 +6,11 @@ - \ref main_support \section main_intro Introduction - Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your - applications with the VP8 video codec, a high quality, royalty free, open - source codec deployed on millions of computers and devices worldwide. + Welcome to the WebM Codec SDK. This SDK allows you to integrate your + applications with the VP8 and VP9 video codecs, high quality, royalty free, + open source codecs deployed on billions of computers and devices worldwide. - This distribution of the WebM VP8 Codec SDK includes the following support: + This distribution of the WebM Codec SDK includes the following support: \if vp8_encoder - \ref vp8_encoder @@ -28,12 +28,12 @@ - Read the \ref samples "sample code" for examples of how to interact with the codec. - \ref codec reference - \if encoder - - \ref encoder reference - \endif - \if decoder - - \ref decoder reference - \endif + \if encoder + - \ref encoder reference + \endif + \if decoder + - \ref decoder reference + \endif \section main_support Support Options & FAQ The WebM project is an open source project supported by its community. For diff --git a/test/datarate_test.cc b/test/datarate_test.cc index a76d806bf..e52934771 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -38,6 +38,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, first_drop_ = 0; bits_total_ = 0; duration_ = 0.0; + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -45,6 +47,17 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, if (video->frame() == 1) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + } + const vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; @@ -124,6 +137,8 @@ class DatarateTestLarge : public ::libvpx_test::EncoderTest, double effective_datarate_; size_t bits_in_last_frame_; int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; }; #if CONFIG_TEMPORAL_DENOISING @@ -155,6 +170,29 @@ TEST_P(DatarateTestLarge, DenoiserLevels) { << " The datarate for the file missed the target!"; } } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestLarge, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3) + << " The datarate for the file missed the target!"; +} #endif // CONFIG_TEMPORAL_DENOISING TEST_P(DatarateTestLarge, BasicBufferModel) { @@ -246,6 +284,8 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest, for (int i = 0; i < 3; ++i) { bits_total_[i] = 0; } + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; } // @@ -313,10 +353,20 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 1) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); - encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } } + + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + if (cfg_.ts_number_layers > 1) { if (video->frame() == 1) { encoder->Control(VP9E_SET_SVC, 1); @@ -398,6 +448,8 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest, vpx_codec_pts_t first_drop_; int num_drops_; int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; }; // Check basic rate targeting, @@ -652,6 +704,38 @@ TEST_P(DatarateTestVP9Large, DenoiserLevels) { ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) << " The datarate for the file is greater than target by too much!"; } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestVP9Large, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} #endif // CONFIG_VP9_TEMPORAL_DENOISING VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES); diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 33399e95d..c24d51701 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -176,7 +176,8 @@ class VP9NewEncodeDecodePerfTest : // Write frame header and data. ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz); - ASSERT_GT(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), 0); + ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), + pkt->data.frame.sz); } virtual bool DoDecode() { return false; } diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 769e14ae2..7e9f0d6c4 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -189,7 +189,7 @@ TEST_P(VP9EncodePerfTest, PerfTest) { printf("\t\"totalFrames\" : %u,\n", frames); printf("\t\"framesPerSecond\" : %f,\n", fps); printf("\t\"minPsnr\" : %f,\n", minimum_psnr); - printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]); + printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]); printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]); printf("}\n"); } diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 7a133643d..b03235b71 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -64,8 +64,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video, // Encode the frame API_REGISTER_STATE_CHECK( - res = vpx_codec_encode(&encoder_, - video.img(), video.pts(), video.duration(), + res = vpx_codec_encode(&encoder_, img, video.pts(), video.duration(), frame_flags, deadline_)); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } @@ -115,6 +114,7 @@ void EncoderTest::SetMode(TestMode mode) { static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) { bool match = (img1->fmt == img2->fmt) && + (img1->cs == img2->cs) && (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h); diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 182547bdf..9a99a80f8 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -37,6 +37,7 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, void Reset() { error_nframes_ = 0; droppable_nframes_ = 0; + pattern_switch_ = 0; } virtual void SetUp() { @@ -62,19 +63,37 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, // 1 3 // 0 2 ..... // LAST is updated on base/layer 0, GOLDEN updated on layer 1. - int SetFrameFlags(int frame_num, int num_temp_layers) { + // Non-zero pattern_switch parameter means pattern will switch to + // not using LAST for frame_num >= pattern_switch. + int SetFrameFlags(int frame_num, + int num_temp_layers, + int pattern_switch) { int frame_flags = 0; if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - // Layer 0: predict from L and ARF, update L. - frame_flags = VP8_EFLAG_NO_REF_GF | - VP8_EFLAG_NO_UPD_GF | - VP8_EFLAG_NO_UPD_ARF; - } else { - // Layer 1: predict from L, GF, and ARF, and update GF. - frame_flags = VP8_EFLAG_NO_UPD_ARF | - VP8_EFLAG_NO_UPD_LAST; - } + if (frame_num % 2 == 0) { + if (frame_num < pattern_switch || pattern_switch == 0) { + // Layer 0: predict from LAST and ARF, update LAST. + frame_flags = VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 0: predict from GF and ARF, update GF. + frame_flags = VP8_EFLAG_NO_REF_LAST | + VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ARF; + } + } else { + if (frame_num < pattern_switch || pattern_switch == 0) { + // Layer 1: predict from L, GF, and ARF, update GF. + frame_flags = VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_UPD_LAST; + } else { + // Layer 1: predict from GF and ARF, update GF. + frame_flags = VP8_EFLAG_NO_REF_LAST | + VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ARF; + } + } } return frame_flags; } @@ -86,7 +105,9 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. if (cfg_.ts_number_layers > 1) { - frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers); + frame_flags_ = SetFrameFlags(video->frame(), + cfg_.ts_number_layers, + pattern_switch_); for (unsigned int i = 0; i < droppable_nframes_; ++i) { if (droppable_frames_[i] == video->frame()) { std::cout << "Encoding droppable frame: " @@ -168,11 +189,16 @@ class ErrorResilienceTestLarge : public ::libvpx_test::EncoderTest, return mismatch_nframes_; } + void SetPatternSwitch(int frame_switch) { + pattern_switch_ = frame_switch; + } + private: double psnr_; unsigned int nframes_; unsigned int error_nframes_; unsigned int droppable_nframes_; + unsigned int pattern_switch_; double mismatch_psnr_; unsigned int mismatch_nframes_; unsigned int error_frames_[kMaxErrorFrames]; @@ -299,6 +325,7 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) { // Error resilient mode ON. cfg_.g_error_resilient = 1; cfg_.kf_mode = VPX_KF_DISABLED; + SetPatternSwitch(0); // The odd frames are the enhancement layer for 2 layer pattern, so set // those frames as droppable. Drop the last 7 frames. @@ -316,6 +343,45 @@ TEST_P(ErrorResilienceTestLarge, 2LayersDropEnhancement) { Reset(); } +// Check for successful decoding and no encoder/decoder mismatch +// for a two layer temporal pattern, where at some point in the +// sequence, the LAST ref is not used anymore. +TEST_P(ErrorResilienceTestLarge, 2LayersNoRefLast) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 0; + + cfg_.rc_end_usage = VPX_CBR; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + timebase.den, timebase.num, 0, 100); + + // Error resilient mode ON. + cfg_.g_error_resilient = 1; + cfg_.kf_mode = VPX_KF_DISABLED; + SetPatternSwitch(60); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Test that no mismatches have been found + std::cout << " Mismatch frames: " + << GetMismatchFrames() << "\n"; + EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0); + + // Reset previously set of error/droppable frames. + Reset(); +} + class ErrorResilienceTestLargeCodecControls : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index cdc0a9895..c836facb3 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -107,6 +107,36 @@ void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 +#if HAVE_NEON_ASM +#if CONFIG_VP9_HIGHBITDEPTH +// No neon high bitdepth functions. +#else +void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); +} + +void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh); +} + +void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh); +} + +void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh); +} +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON_ASM + class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { public: virtual ~Loop8Test6Param() {} @@ -594,33 +624,45 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif -#if HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH) +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +// No neon high bitdepth functions. +#else INSTANTIATE_TEST_CASE_P( NEON, Loop8Test6Param, ::testing::Values( #if HAVE_NEON_ASM +// Using #if inside the macro is unsupported on MSVS but the tests are not +// currently built for MSVS with ARM and NEON. make_tuple(&vp9_lpf_horizontal_16_neon, &vp9_lpf_horizontal_16_c, 8), + make_tuple(&wrapper_vertical_16_neon, + &wrapper_vertical_16_c, 8), + make_tuple(&wrapper_vertical_16_dual_neon, + &wrapper_vertical_16_dual_c, 8), + make_tuple(&vp9_lpf_horizontal_8_neon, + &vp9_lpf_horizontal_8_c, 8), + make_tuple(&vp9_lpf_vertical_8_neon, + &vp9_lpf_vertical_8_c, 8), #endif // HAVE_NEON_ASM make_tuple(&vp9_lpf_horizontal_4_neon, &vp9_lpf_horizontal_4_c, 8), - make_tuple(&vp9_lpf_horizontal_8_neon, - &vp9_lpf_horizontal_8_c, 8), make_tuple(&vp9_lpf_vertical_4_neon, - &vp9_lpf_vertical_4_c, 8), - make_tuple(&vp9_lpf_vertical_8_neon, - &vp9_lpf_vertical_8_c, 8))); + &vp9_lpf_vertical_4_c, 8))); INSTANTIATE_TEST_CASE_P( NEON, Loop8Test9Param, ::testing::Values( - make_tuple(&vp9_lpf_horizontal_4_dual_neon, - &vp9_lpf_horizontal_4_dual_c, 8), +#if HAVE_NEON_ASM make_tuple(&vp9_lpf_horizontal_8_dual_neon, &vp9_lpf_horizontal_8_dual_c, 8), - make_tuple(&vp9_lpf_vertical_4_dual_neon, - &vp9_lpf_vertical_4_dual_c, 8), make_tuple(&vp9_lpf_vertical_8_dual_neon, - &vp9_lpf_vertical_8_dual_c, 8))); -#endif // HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH) + &vp9_lpf_vertical_8_dual_c, 8), +#endif // HAVE_NEON_ASM + make_tuple(&vp9_lpf_horizontal_4_dual_neon, + &vp9_lpf_horizontal_4_dual_c, 8), + make_tuple(&vp9_lpf_vertical_4_dual_neon, + &vp9_lpf_vertical_4_dual_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON } // namespace diff --git a/test/variance_test.cc b/test/variance_test.cc index 4d279f686..a8dd7de13 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1914,11 +1914,17 @@ INSTANTIATE_TEST_CASE_P( const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; +const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon; +const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon; +const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9VarianceTest, ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), make_tuple(4, 4, variance16x16_neon, 0), - make_tuple(5, 5, variance32x32_neon, 0))); + make_tuple(5, 5, variance32x32_neon, 0), + make_tuple(5, 6, variance32x64_neon, 0), + make_tuple(6, 5, variance64x32_neon, 0), + make_tuple(6, 6, variance64x64_neon, 0))); const vp9_subpixvariance_fn_t subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; @@ -1926,11 +1932,14 @@ const vp9_subpixvariance_fn_t subpel_variance16x16_neon = vp9_sub_pixel_variance16x16_neon; const vp9_subpixvariance_fn_t subpel_variance32x32_neon = vp9_sub_pixel_variance32x32_neon; +const vp9_subpixvariance_fn_t subpel_variance64x64_neon = + vp9_sub_pixel_variance64x64_neon; INSTANTIATE_TEST_CASE_P( NEON, VP9SubpelVarianceTest, ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0), make_tuple(4, 4, subpel_variance16x16_neon, 0), - make_tuple(5, 5, subpel_variance32x32_neon, 0))); + make_tuple(5, 5, subpel_variance32x32_neon, 0), + make_tuple(6, 6, subpel_variance64x64_neon, 0))); #endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER diff --git a/test/video_source.h b/test/video_source.h index 84bfa8e53..b97e1550e 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -175,8 +175,8 @@ class DummyVideoSource : public VideoSource { void SetSize(unsigned int width, unsigned int height) { if (width != width_ || height != height_) { vpx_img_free(img_); - raw_sz_ = ((width + 31)&~31) * height * 3 / 2; img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32); + raw_sz_ = ((img_->w + 31) & ~31) * img_->h * 3 / 2; width_ = width; height_ = height; } diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc index fa04528a2..252ed4efa 100644 --- a/test/vp9_avg_test.cc +++ b/test/vp9_avg_test.cc @@ -165,4 +165,14 @@ INSTANTIATE_TEST_CASE_P( #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, AverageTest, + ::testing::Values( + make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon), + make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon), + make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon))); + +#endif + } // namespace diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index 6c354fd38..34e7854a9 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -65,14 +65,15 @@ struct EncodeParameters { int32_t lossless; int32_t error_resilient; int32_t frame_parallel; + vpx_color_space_t cs; // TODO(JBB): quantizers / bitrate }; const EncodeParameters kVP9EncodeParameterSet[] = { - {0, 0, 0, 1, 0}, - {0, 0, 0, 0, 0}, - {0, 0, 1, 0, 0}, - {0, 2, 0, 0, 1}, + {0, 0, 0, 1, 0, VPX_CS_BT_601}, + {0, 0, 0, 0, 0, VPX_CS_BT_709}, + {0, 0, 1, 0, 0, VPX_CS_BT_2020}, + {0, 2, 0, 0, 1, VPX_CS_UNKNOWN}, // TODO(JBB): Test profiles (requires more work). }; @@ -109,6 +110,7 @@ class Vp9EncoderParmsGetToDecoder virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (video->frame() == 1) { + encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless); encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, encode_parms.frame_parallel); @@ -147,7 +149,7 @@ class Vp9EncoderParmsGetToDecoder EXPECT_EQ(common->frame_parallel_decoding_mode, encode_parms.frame_parallel); } - + EXPECT_EQ(common->color_space, encode_parms.cs); EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols); EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows); @@ -80,8 +80,11 @@ The available initialization methods are: - \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif - \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif + \if encoder + - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) + - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) + . + \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index fc026aa9c..d02cd30b9 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -753,45 +753,46 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int ref_frame_map[4]; int sign_bias = 0; int dot_artifact_candidate = 0; - // For detecting dot artifact. - unsigned char* target = x->src.y_buffer; - unsigned char* target_u = x->block[16].src + *x->block[16].base_src; - unsigned char* target_v = x->block[20].src + *x->block[20].base_src; - int stride = x->src.y_stride; - int stride_uv = x->block[16].src_stride; + get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset); + + // If the current frame is using LAST as a reference, check for + // biasing the mode selection for dot artifacts. + if (cpi->ref_frame_flags & VP8_LAST_FRAME) { + unsigned char* target_y = x->src.y_buffer; + unsigned char* target_u = x->block[16].src + *x->block[16].base_src; + unsigned char* target_v = x->block[20].src + *x->block[20].base_src; + int stride = x->src.y_stride; + int stride_uv = x->block[16].src_stride; #if CONFIG_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity) { - int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0; - target = - cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset; - stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride; - if (uv_denoise) { - target_u = - cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + recon_uvoffset; - target_v = - cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + recon_uvoffset; - stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride; + if (cpi->oxcf.noise_sensitivity) { + const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0; + target_y = + cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset; + stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride; + if (uv_denoise) { + target_u = + cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer + + recon_uvoffset; + target_v = + cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer + + recon_uvoffset; + stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride; + } } - } #endif - - get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset); - - dot_artifact_candidate = - check_dot_artifact_candidate(cpi, x, - target, stride, - plane[LAST_FRAME][0], mb_row, mb_col, 0); - // If not found in Y channel, check UV channel. - if (!dot_artifact_candidate) { dot_artifact_candidate = - check_dot_artifact_candidate(cpi, x, - target_u, stride_uv, - plane[LAST_FRAME][1], mb_row, mb_col, 1); + check_dot_artifact_candidate(cpi, x, target_y, stride, + plane[LAST_FRAME][0], mb_row, mb_col, 0); + // If not found in Y channel, check UV channel. if (!dot_artifact_candidate) { dot_artifact_candidate = - check_dot_artifact_candidate(cpi, x, - target_v, stride_uv, - plane[LAST_FRAME][2], mb_row, mb_col, 2); + check_dot_artifact_candidate(cpi, x, target_u, stride_uv, + plane[LAST_FRAME][1], mb_row, mb_col, 1); + if (!dot_artifact_candidate) { + dot_artifact_candidate = + check_dot_artifact_candidate(cpi, x, target_v, stride_uv, + plane[LAST_FRAME][2], mb_row, mb_col, 2); + } } } diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 6810644ba..67a0fef64 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -188,7 +188,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, /* vet via sync code */ if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a) - res = VPX_CODEC_UNSUP_BITSTREAM; + return VPX_CODEC_UNSUP_BITSTREAM; si->w = (clear[6] | (clear[7] << 8)) & 0x3fff; si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; @@ -402,7 +402,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res) { VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; - if(resolution_change) + if (resolution_change) { VP8_COMMON *const pc = & pbi->common; MACROBLOCKD *const xd = & pbi->mb; diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index 09f470e97..c69ee1009 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -124,7 +124,6 @@ static INLINE void vp9_loop_filter_neon_16( return; } -#if !HAVE_NEON_ASM void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, @@ -178,47 +177,3 @@ void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, vst1q_u8(s, q8u8); return; } -#endif // !HAVE_NEON_ASM - -void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); -} - -void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); -} - -void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); -} - -#if HAVE_NEON_ASM -void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); - vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); -} -#endif // HAVE_NEON_ASM diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c new file mode 100644 index 000000000..fd9db6187 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +static INLINE void vp9_loop_filter_neon( + uint8x8_t dblimit, // flimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d3u8 = vabd_u8(d17u8, d16u8); + d4u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + d22u8 = vcgt_u8(d22u8, dthresh); + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); + d17u8 = vqadd_u8(d17u8, d17u8); + + d23u8 = vcge_u8(dlimit, d23u8); + + d18u8 = vdup_n_u8(0x80); + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), + vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), + vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; +} + +void vp9_lpf_horizontal_4_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + if (count == 0) // end_vp9_lf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < count; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + vp9_loop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; +} + +void vp9_lpf_vertical_4_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + if (count == 0) // end_vp9_lf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < count; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), + vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), + vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), + vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), + vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + vp9_loop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d4u8, &d5u8, &d6u8, &d7u8); + + d4Result.val[0] = d4u8; + d4Result.val[1] = d5u8; + d4Result.val[2] = d6u8; + d4Result.val[3] = d7u8; + + src -= 2; + vst4_lane_u8(src, d4Result, 0); + src += pitch; + vst4_lane_u8(src, d4Result, 1); + src += pitch; + vst4_lane_u8(src, d4Result, 2); + src += pitch; + vst4_lane_u8(src, d4Result, 3); + src += pitch; + vst4_lane_u8(src, d4Result, 4); + src += pitch; + vst4_lane_u8(src, d4Result, 5); + src += pitch; + vst4_lane_u8(src, d4Result, 6); + src += pitch; + vst4_lane_u8(src, d4Result, 7); + } + return; +} diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm new file mode 100644 index 000000000..7738e0d3a --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm @@ -0,0 +1,277 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_lpf_horizontal_4_neon| + EXPORT |vp9_lpf_vertical_4_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. +; +; void vp9_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_lpf_horizontal_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + cmp r12, #0 + beq end_vp9_lf_h_edge + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + +count_lf_h_loop + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl vp9_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + add r0, r0, #8 + subs r12, r12, #1 + bne count_lf_h_loop + +end_vp9_lf_h_edge + pop {pc} + ENDP ; |vp9_lpf_horizontal_4_neon| + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. +; +; void vp9_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_lpf_vertical_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + cmp r12, #0 + beq end_vp9_lf_v_edge + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + +count_lf_v_loop + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl vp9_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + add r0, r0, r1, lsl #3 ; s += pitch * 8 + subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns + bne count_lf_v_loop + +end_vp9_lf_v_edge + pop {pc} + ENDP ; |vp9_lpf_vertical_4_neon| + +; void vp9_loop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|vp9_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon| + + END diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c new file mode 100644 index 000000000..33068a8a2 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +static INLINE void vp9_mbloop_filter_neon( + uint8x8_t dblimit, // mblimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p2 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d0ru8, // p1 + uint8x8_t *d1ru8, // p1 + uint8x8_t *d2ru8, // p0 + uint8x8_t *d3ru8, // q0 + uint8x8_t *d4ru8, // q1 + uint8x8_t *d5ru8) { // q1 + uint32_t flat; + uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; + uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int16x8_t q15s16; + uint16x8_t q10u16, q14u16; + int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d23u8 = vabd_u8(d17u8, d16u8); + d24u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + + d25u8 = vabd_u8(d6u8, d4u8); + + d23u8 = vmax_u8(d23u8, d24u8); + + d26u8 = vabd_u8(d7u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + + d24u8 = vabd_u8(d6u8, d7u8); + d27u8 = vabd_u8(d3u8, d6u8); + d28u8 = vabd_u8(d18u8, d7u8); + + d19u8 = vmax_u8(d19u8, d23u8); + + d23u8 = vabd_u8(d5u8, d16u8); + d24u8 = vqadd_u8(d24u8, d24u8); + + + d19u8 = vcge_u8(dlimit, d19u8); + + + d25u8 = vmax_u8(d25u8, d26u8); + d26u8 = vmax_u8(d27u8, d28u8); + + d23u8 = vshr_n_u8(d23u8, 1); + + d25u8 = vmax_u8(d25u8, d26u8); + + d24u8 = vqadd_u8(d24u8, d23u8); + + d20u8 = vmax_u8(d20u8, d25u8); + + d23u8 = vdup_n_u8(1); + d24u8 = vcge_u8(dblimit, d24u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + + d20u8 = vcge_u8(d23u8, d20u8); + + d19u8 = vand_u8(d19u8, d24u8); + + d23u8 = vcgt_u8(d22u8, dthresh); + + d20u8 = vand_u8(d20u8, d19u8); + + d22u8 = vdup_n_u8(0x80); + + d23u8 = vorr_u8(d21u8, d23u8); + + q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), + vreinterpret_u16_u8(d21u8)); + + d30u8 = vshrn_n_u16(q10u16, 4); + flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); + + if (flat == 0xffffffff) { // Check for all 1's, power_branch_only + d27u8 = vdup_n_u8(3); + d21u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d21u8); + q14u16 = vaddw_u8(q14u16, d5u8); + *d0ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); + + d27u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + q15s16 = vaddw_s8(q15s16, d29s8); + + d29u8 = vdup_n_u8(4); + + d28s8 = vqmovn_s16(q15s16); + + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); + + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); + + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } + + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + + q14u16 = vaddw_u8(q14u16, d5u8); + + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + + d30u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + + d31u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + + d23u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + + d22u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + + d6u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + + d7u8 = vqrshrn_n_u16(q14u16, 3); + + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; +} + +void vp9_lpf_horizontal_8_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + if (count == 0) // end_vp9_mblf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < count; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; +} + +void vp9_lpf_vertical_8_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + if (count == 0) + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < count; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), + vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), + vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), + vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), + vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); + + d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; +} diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm index 443032217..91aaec04e 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm +++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm @@ -8,8 +8,6 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_lpf_horizontal_4_neon| - EXPORT |vp9_lpf_vertical_4_neon| EXPORT |vp9_lpf_horizontal_8_neon| EXPORT |vp9_lpf_vertical_8_neon| ARM @@ -21,261 +19,6 @@ ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. ; -; void vp9_lpf_horizontal_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; sp+4 int count -|vp9_lpf_horizontal_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count - ldr r2, [sp, #4] ; load thresh - add r1, r1, r1 ; double pitch - - cmp r12, #0 - beq end_vp9_lf_h_edge - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - -count_lf_h_loop - sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r3, r2, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r2@64], r1 ; p3 - vld1.u8 {d4}, [r3@64], r1 ; p2 - vld1.u8 {d5}, [r2@64], r1 ; p1 - vld1.u8 {d6}, [r3@64], r1 ; p0 - vld1.u8 {d7}, [r2@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r2@64] ; q2 - vld1.u8 {d18}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl vp9_loop_filter_neon - - vst1.u8 {d4}, [r2@64], r1 ; store op1 - vst1.u8 {d5}, [r3@64], r1 ; store op0 - vst1.u8 {d6}, [r2@64], r1 ; store oq0 - vst1.u8 {d7}, [r3@64], r1 ; store oq1 - - add r0, r0, #8 - subs r12, r12, #1 - bne count_lf_h_loop - -end_vp9_lf_h_edge - pop {pc} - ENDP ; |vp9_lpf_horizontal_4_neon| - -; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter -; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. -; -; void vp9_lpf_vertical_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; sp+4 int count -|vp9_lpf_vertical_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #4] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - cmp r12, #0 - beq end_vp9_lf_v_edge - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - -count_lf_v_loop - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - bl vp9_loop_filter_neon - - sub r0, r0, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 - vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 - vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 - vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 - vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 - vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 - vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 - vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - - add r0, r0, r1, lsl #3 ; s += pitch * 8 - subs r12, r12, #1 - subne r2, r0, #4 ; move s pointer down by 4 columns - bne count_lf_v_loop - -end_vp9_lf_v_edge - pop {pc} - ENDP ; |vp9_lpf_vertical_4_neon| - -; void vp9_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d4 op1 -; d5 op0 -; d6 oq0 -; d7 oq1 -|vp9_loop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d17, d6, d7 ; abs(p0 - q0) - - vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) - - vmov.u8 d18, #0x80 - - vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) - - ; hevmask - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) - - vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 - - veor d7, d7, d18 ; qs0 - - vcge.u8 d23, d1, d23 ; abs(m1) > limit - - ; filter() function - ; convert to signed - - vshr.u8 d28, d28, #1 ; a = a / 2 - veor d6, d6, d18 ; ps0 - - veor d5, d5, d18 ; ps1 - vqadd.u8 d17, d17, d28 ; a = b + a - - veor d16, d16, d18 ; qs1 - - vmov.u8 d19, #3 - - vsub.s8 d28, d7, d6 ; ( qs0 - ps0) - - vcge.u8 d17, d0, d17 ; a > blimit - - vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) - vorr d22, d21, d22 ; hevmask - - vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) - - vand d27, d27, d22 ; filter &= hev - vand d23, d23, d17 ; filter_mask - - vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d17, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d27, q12 - - vand d27, d27, d23 ; filter &= mask - - vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) - vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) - vshr.s8 d28, d28, #3 ; filter2 >>= 3 - vshr.s8 d27, d27, #3 ; filter1 >>= 3 - - vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) - vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 - - veor d6, d26, d18 ; *oq0 = u^0x80 - - vbic d27, d27, d22 ; filter &= ~hev - - vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) - vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) - - veor d5, d19, d18 ; *op0 = u^0x80 - veor d4, d21, d18 ; *op1 = u^0x80 - veor d7, d20, d18 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp9_loop_filter_neon| - ; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p, ; const uint8_t *blimit, ; const uint8_t *limit, diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c index 079d26677..31fcc63ba 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c @@ -10,705 +10,49 @@ #include <arm_neon.h> +#include "./vp9_rtcd.h" #include "./vpx_config.h" - -static INLINE void vp9_loop_filter_neon( - uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), - vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), - vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; -} - -void vp9_lpf_horizontal_4_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - if (count == 0) // end_vp9_lf_h_edge - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - vp9_loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; -} - -void vp9_lpf_vertical_4_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - if (count == 0) // end_vp9_lf_h_edge - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < count; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - vp9_loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; +#include "vpx/vpx_integer.h" + +void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); } -static INLINE void vp9_mbloop_filter_neon( - uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - - d25u8 = vabd_u8(d6u8, d4u8); - - d23u8 = vmax_u8(d23u8, d24u8); - - d26u8 = vabd_u8(d7u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); - - d19u8 = vmax_u8(d19u8, d23u8); - - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); - - - d19u8 = vcge_u8(dlimit, d19u8); - - - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); - - d23u8 = vshr_n_u8(d23u8, 1); - - d25u8 = vmax_u8(d25u8, d26u8); - - d24u8 = vqadd_u8(d24u8, d23u8); - - d20u8 = vmax_u8(d20u8, d25u8); - - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - - d20u8 = vcge_u8(d23u8, d20u8); - - d19u8 = vand_u8(d19u8, d24u8); - - d23u8 = vcgt_u8(d22u8, dthresh); - - d20u8 = vand_u8(d20u8, d19u8); - - d22u8 = vdup_n_u8(0x80); - - d23u8 = vorr_u8(d21u8, d23u8); - - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), - vreinterpret_u16_u8(d21u8)); - - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); - - d27u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - q15s16 = vaddw_s8(q15s16, d29s8); - - d29u8 = vdup_n_u8(4); - - d28s8 = vqmovn_s16(q15s16); - - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); - - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } - - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - - q14u16 = vaddw_u8(q14u16, d5u8); - - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - - d30u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - - d31u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - - d23u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - - d22u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - - d6u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - - d7u8 = vqrshrn_n_u16(q14u16, 3); - - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; +#if HAVE_NEON_ASM +void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); } -void vp9_lpf_horizontal_8_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - if (count == 0) // end_vp9_mblf_h_edge - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; +void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); } -void vp9_lpf_vertical_8_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - if (count == 0) - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i < count; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; +void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); } +#endif // HAVE_NEON_ASM diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index e7fb19fd1..124057634 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -232,6 +232,8 @@ typedef struct macroblockd { int lossless; int corrupted; + + struct vpx_internal_error_info *error_info; } MACROBLOCKD; static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 239c0494c..4eb2e6413 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -81,6 +81,7 @@ typedef struct { const vp9_prob *prob; int len; int base_val; + const int16_t *cost; } vp9_extra_bit; // indexed by token value diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 7454dd439..7938fc10a 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -99,17 +99,6 @@ typedef enum { } TX_TYPE; typedef enum { - UNKNOWN = 0, - BT_601 = 1, // YUV - BT_709 = 2, // YUV - SMPTE_170 = 3, // YUV - SMPTE_240 = 4, // YUV - BT_2020 = 5, // YUV - RESERVED_2 = 6, - SRGB = 7 // RGB -} COLOR_SPACE; - -typedef enum { VP9_LAST_FLAG = 1 << 0, VP9_GOLD_FLAG = 1 << 1, VP9_ALT_FLAG = 1 << 2, diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 1a3fefc5f..b48d52230 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -1517,12 +1517,12 @@ void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1), bd); - step[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1), bd); - step[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 2 output[0] = WRAPLOW(step[0] + step[3], bd); @@ -1562,10 +1562,11 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1587,12 +1588,12 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 2 & stage 3 - even half vp9_highbd_idct4(step1, step1, bd); @@ -1607,8 +1608,8 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[7] = step2[7]; // stage 4 @@ -1653,9 +1654,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) @@ -1696,10 +1698,10 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd); - output[2] = WRAPLOW(dct_const_round_shift(s2), bd); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); + output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); + output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); + output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); } void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1764,14 +1766,14 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd); + x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); + x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); // stage 2 s0 = x0; @@ -1787,10 +1789,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { x1 = WRAPLOW(s1 + s3, bd); x2 = WRAPLOW(s0 - s2, bd); x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1798,10 +1800,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = WRAPLOW(dct_const_round_shift(s2), bd); - x3 = WRAPLOW(dct_const_round_shift(s3), bd); - x6 = WRAPLOW(dct_const_round_shift(s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s7), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); output[0] = WRAPLOW(x0, bd); output[1] = WRAPLOW(-x4, bd); @@ -1910,23 +1912,23 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 3 step1[0] = step2[0]; @@ -1936,12 +1938,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[8] = WRAPLOW(step2[8] + step2[9], bd); step1[9] = WRAPLOW(step2[8] - step2[9], bd); @@ -1955,12 +1957,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[4] = WRAPLOW(step1[4] + step1[5], bd); step2[5] = WRAPLOW(step1[4] - step1[5], bd); step2[6] = WRAPLOW(-step1[6] + step1[7], bd); @@ -1970,12 +1972,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -1987,8 +1989,8 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[7] = step2[7]; step1[8] = WRAPLOW(step2[8] + step2[11], bd); @@ -2013,12 +2015,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2115,22 +2117,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd); + x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); + x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); + x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); + x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); // stage 2 s0 = x0; @@ -2158,14 +2160,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, x5 = WRAPLOW(s1 - s5, bd); x6 = WRAPLOW(s2 - s6, bd); x7 = WRAPLOW(s3 - s7, bd); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd); + x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); + x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); // stage 3 s0 = x0; @@ -2189,18 +2191,18 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, x1 = WRAPLOW(s1 + s3, bd); x2 = WRAPLOW(s0 - s2, bd); x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); x8 = WRAPLOW(s8 + s10, bd); x9 = WRAPLOW(s9 + s11, bd); x10 = WRAPLOW(s8 - s10, bd); x11 = WRAPLOW(s9 - s11, bd); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -2212,14 +2214,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(dct_const_round_shift(s2), bd); - x3 = WRAPLOW(dct_const_round_shift(s3), bd); - x6 = WRAPLOW(dct_const_round_shift(s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s7), bd); - x10 = WRAPLOW(dct_const_round_shift(s10), bd); - x11 = WRAPLOW(dct_const_round_shift(s11), bd); - x14 = WRAPLOW(dct_const_round_shift(s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s15), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); output[0] = WRAPLOW(x0, bd); output[1] = WRAPLOW(-x8, bd); @@ -2306,10 +2308,11 @@ void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) @@ -2343,43 +2346,43 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 2 step2[0] = step1[0]; @@ -2393,23 +2396,23 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[16] = WRAPLOW(step1[16] + step1[17], bd); step2[17] = WRAPLOW(step1[16] - step1[17], bd); @@ -2436,12 +2439,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[8] = WRAPLOW(step2[8] + step2[9], bd); step1[9] = WRAPLOW(step2[8] - step2[9], bd); @@ -2456,22 +2459,22 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2480,12 +2483,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[4] = WRAPLOW(step1[4] + step1[5], bd); step2[5] = WRAPLOW(step1[4] - step1[5], bd); step2[6] = WRAPLOW(-step1[6] + step1[7], bd); @@ -2495,12 +2498,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -2530,8 +2533,8 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[7] = step2[7]; step1[8] = WRAPLOW(step2[8] + step2[11], bd); @@ -2547,20 +2550,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2581,12 +2584,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2632,20 +2635,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; @@ -2759,8 +2762,9 @@ void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 1d8836cf3..6e2551dd4 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -80,13 +80,7 @@ static const tran_high_t sinpi_3_9 = 13377; static const tran_high_t sinpi_4_9 = 15212; static INLINE tran_low_t check_range(tran_high_t input) { -#if CONFIG_VP9_HIGHBITDEPTH - // For valid highbitdepth VP9 streams, intermediate stage coefficients will - // stay within the ranges: - // - 8 bit: signed 16 bit integer - // - 10 bit: signed 18 bit integer - // - 12 bit: signed 20 bit integer -#elif CONFIG_COEFFICIENT_RANGE_CHECKING +#if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid VP9 input streams, intermediate stage coefficients should always // stay within the range of a signed 16 bit integer. Coefficients can go out // of this range for invalid/corrupt VP9 streams. However, strictly checking @@ -95,7 +89,7 @@ static INLINE tran_low_t check_range(tran_high_t input) { // --enable-coefficient-range-checking. assert(INT16_MIN <= input); assert(input <= INT16_MAX); -#endif +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING return (tran_low_t)input; } @@ -104,6 +98,32 @@ static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { return check_range(rv); } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_low_t highbd_check_range(tran_high_t input, + int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void) int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void) bd; + return (tran_low_t)input; +} + +static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, + int bd) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return highbd_check_range(rv, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + typedef void (*transform_1d)(const tran_low_t*, tran_low_t*); typedef struct { diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 58b2da75f..2101ec58c 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, } #endif // CONFIG_VP9_HIGHBITDEPTH -static void filter_block_plane_non420(VP9_COMMON *cm, - struct macroblockd_plane *plane, - MODE_INFO *mi_8x8, - int mi_row, int mi_col) { +void vp9_filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO *mi_8x8, + int mi_row, int mi_col) { const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_y; @@ -1598,8 +1598,8 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, if (use_420) vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); else - filter_block_plane_non420(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); } } } diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 4c15e6bd4..6d7cabf7c 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -97,6 +97,11 @@ void vp9_filter_block_plane(struct VP9Common *const cm, int mi_row, LOOP_FILTER_MASK *lfm); +void vp9_filter_block_plane_non420(struct VP9Common *cm, + struct macroblockd_plane *plane, + MODE_INFO *mi_8x8, + int mi_row, int mi_col); + void vp9_loop_filter_init(struct VP9Common *cm); // Update the loop filter for the current frame. diff --git a/vp9/decoder/vp9_dthread.c b/vp9/common/vp9_loopfilter_thread.c index 3d2d0dd2e..2d47daeaf 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/common/vp9_loopfilter_thread.c @@ -9,14 +9,10 @@ */ #include "./vpx_config.h" - #include "vpx_mem/vpx_mem.h" - +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - #if CONFIG_MULTITHREAD static INLINE void mutex_lock(pthread_mutex_t *const mutex) { const int kMaxTryLocks = 4000; @@ -88,31 +84,43 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } // Implement row loopfiltering for each thread. -static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, - VP9_COMMON *const cm, - struct macroblockd_plane planes[MAX_MB_PLANE], - int start, int stop, int y_only, - VP9LfSync *const lf_sync) { +static INLINE +void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer, + VP9_COMMON *const cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9LfSync *const lf_sync) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; - int r, c; // SB row and col + const int use_420 = y_only || (planes[1].subsampling_y == 1 && + planes[1].subsampling_x == 1); const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + int mi_row, mi_col; - for (r = start; r < stop; r += lf_sync->num_workers) { - const int mi_row = r << MI_BLOCK_SIZE_LOG2; + for (mi_row = start; mi_row < stop; + mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride; - for (c = 0; c < sb_cols; ++c) { - const int mi_col = c << MI_BLOCK_SIZE_LOG2; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; LOOP_FILTER_MASK lfm; int plane; sync_read(lf_sync, r, c); vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); - vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); + + // TODO(JBB): Make setup_mask work for non 420. + if (use_420) + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, + &lfm); for (plane = 0; plane < num_planes; ++plane) { - vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); + if (use_420) + vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); + else + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); } sync_write(lf_sync, r, c, sb_cols); @@ -123,37 +131,33 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, // Row-based multi-threaded loopfilter hook static int loop_filter_row_worker(VP9LfSync *const lf_sync, LFWorkerData *const lf_data) { - loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, - lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); return 1; } -// VP9 decoder: Implement multi-threaded loopfilter that uses the tile -// threads. -void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, - YV12_BUFFER_CONFIG *frame, - struct macroblockd_plane planes[MAX_MB_PLANE], - VP9_COMMON *cm, - VP9Worker *workers, int nworkers, - int frame_filter_level, - int y_only) { +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9Worker *workers, int nworkers, + VP9LfSync *lf_sync) { const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + // Decoder may allocate more threads than number of tiles based on user's + // input. const int tile_cols = 1 << cm->log2_tile_cols; const int num_workers = MIN(nworkers, tile_cols); int i; - if (!frame_filter_level) return; - if (!lf_sync->sync_range || cm->last_height != cm->height || num_workers > lf_sync->num_workers) { vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } - vp9_loop_filter_frame_init(cm, frame_filter_level); - // Initialize cur_sb_col to -1 for all SB rows. vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -175,8 +179,8 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, // Loopfilter data vp9_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = i; - lf_data->stop = sb_rows; + lf_data->start = start + i * MI_BLOCK_SIZE; + lf_data->stop = stop; lf_data->y_only = y_only; // Start loopfiltering @@ -193,8 +197,33 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, } } +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, + int y_only, int partial_frame, + VP9Worker *workers, int num_workers, + VP9LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + if (!frame_filter_level) return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); + + loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, + y_only, workers, num_workers, lf_sync); +} + // Set up nsync by width. -static int get_sync_range(int width) { +static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. if (width < 640) diff --git a/vp9/decoder/vp9_dthread.h b/vp9/common/vp9_loopfilter_thread.h index d5810b45b..bca357e52 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/common/vp9_loopfilter_thread.h @@ -8,21 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - +#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ +#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ #include "./vpx_config.h" +#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_thread.h" -#include "vp9/decoder/vp9_reader.h" struct VP9Common; -struct VP9Decoder; - -typedef struct TileWorkerData { - struct VP9Common *cm; - vp9_reader bit_reader; - DECLARE_ALIGNED(16, struct macroblockd, xd); -} TileWorkerData; // Loopfilter row synchronization typedef struct VP9LfSyncData { @@ -43,19 +35,19 @@ typedef struct VP9LfSyncData { } VP9LfSync; // Allocate memory for loopfilter row synchronization. -void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows, int width, int num_workers); // Deallocate loopfilter synchronization related mutex and data. void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); // Multi-threaded loopfilter that uses the tile threads. -void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, - YV12_BUFFER_CONFIG *frame, - struct macroblockd_plane planes[MAX_MB_PLANE], +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - VP9Worker *workers, int num_workers, + struct macroblockd_plane planes[MAX_MB_PLANE], int frame_filter_level, - int y_only); + int y_only, int partial_frame, + VP9Worker *workers, int num_workers, + VP9LfSync *lf_sync); -#endif // VP9_DECODER_VP9_DTHREAD_H_ +#endif // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index 92650e954..e7ee903c6 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -136,13 +136,27 @@ static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, } } +static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) { + const int adj = qdiff >> MFQE_PRECISION; + if (bs == BLOCK_16X16) { + *sad_thr = 7 + adj; + } else if (bs == BLOCK_32X32) { + *sad_thr = 6 + adj; + } else { // BLOCK_64X64 + *sad_thr = 5 + adj; + } + *vdiff_thr = 125 + qdiff; +} + static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, const uint8_t *v, int y_stride, int uv_stride, - uint8_t *yd, uint8_t *ud, uint8_t *vd, - int yd_stride, int uvd_stride) { - int sad, sad_thr, vdiff; + uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride, + int uvd_stride, int qdiff) { + int sad, sad_thr, vdiff, vdiff_thr; uint32_t sse; + get_thr(bs, qdiff, &sad_thr, &vdiff_thr); + if (bs == BLOCK_16X16) { vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; @@ -154,23 +168,18 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; } - if (bs == BLOCK_16X16) { - sad_thr = 8; - } else if (bs == BLOCK_32X32) { - sad_thr = 7; - } else { // BLOCK_64X64 - sad_thr = 6; - } - - // TODO(jackychen): More experiments and remove magic numbers. // vdiff > sad * 3 means vdiff should not be too small, otherwise, // it might be a lighting change in smooth area. When there is a // lighting change in smooth area, it is dangerous to do MFQE. - if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) { - // TODO(jackychen): Add weighted average in the calculation. - // Currently, the data is copied from last frame without averaging. - apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, - ud, vd, uvd_stride, bs, 0); + if (sad > 1 && vdiff > sad * 3) { + const int weight = 1 << MFQE_PRECISION; + int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr); + // When ifactor equals weight, no MFQE is done. + if (ifactor > weight) { + ifactor = weight; + } + apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, + uvd_stride, bs, ifactor); } else { // Copy the block from current frame (i.e., no mfqe is done). copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, @@ -199,8 +208,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, int yd_stride, int uvd_stride) { int mi_offset, y_offset, uv_offset; const BLOCK_SIZE cur_bs = mi->mbmi.sb_type; - // TODO(jackychen): Consider how and whether to use qdiff in MFQE. - // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex; + const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex; const int bsl = b_width_log2_lookup[bs]; PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; const BLOCK_SIZE subsize = get_subsize(bs, partition); @@ -235,18 +243,18 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, if (mfqe_decision(mi, mfqe_bs)) { // Do mfqe on the first square partition. mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, - yd, ud, vd, yd_stride, uvd_stride); + yd, ud, vd, yd_stride, uvd_stride, qdiff); // Do mfqe on the second square partition. mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, y_stride, uv_stride, yd + y_offset, ud + uv_offset, - vd + uv_offset, yd_stride, uvd_stride); + vd + uv_offset, yd_stride, uvd_stride, qdiff); } if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) { // Do mfqe on the first square partition. mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride, v + uv_offset * uv_stride, y_stride, uv_stride, yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, - vd + uv_offset * uvd_stride, yd_stride, uvd_stride); + vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff); // Do mfqe on the second square partition. mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset, u + uv_offset * uv_stride + uv_offset, @@ -254,7 +262,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_stride, yd + y_offset * yd_stride + y_offset, ud + uv_offset * uvd_stride + uv_offset, vd + uv_offset * uvd_stride + uv_offset, - yd_stride, uvd_stride); + yd_stride, uvd_stride, qdiff); } break; case PARTITION_VERT: @@ -268,18 +276,18 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, if (mfqe_decision(mi, mfqe_bs)) { // Do mfqe on the first square partition. mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride, - yd, ud, vd, yd_stride, uvd_stride); + yd, ud, vd, yd_stride, uvd_stride, qdiff); // Do mfqe on the second square partition. mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride, v + uv_offset * uv_stride, y_stride, uv_stride, yd + y_offset * yd_stride, ud + uv_offset * uvd_stride, - vd + uv_offset * uvd_stride, yd_stride, uvd_stride); + vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff); } if (mfqe_decision(mi + mi_offset, mfqe_bs)) { // Do mfqe on the first square partition. mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset, y_stride, uv_stride, yd + y_offset, ud + uv_offset, - vd + uv_offset, yd_stride, uvd_stride); + vd + uv_offset, yd_stride, uvd_stride, qdiff); // Do mfqe on the second square partition. mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset, u + uv_offset * uv_stride + uv_offset, @@ -287,14 +295,14 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_stride, yd + y_offset * yd_stride + y_offset, ud + uv_offset * uvd_stride + uv_offset, vd + uv_offset * uvd_stride + uv_offset, - yd_stride, uvd_stride); + yd_stride, uvd_stride, qdiff); } break; case PARTITION_NONE: if (mfqe_decision(mi, cur_bs)) { // Do mfqe on this partition. mfqe_block(cur_bs, y, u, v, y_stride, uv_stride, - yd, ud, vd, yd_stride, uvd_stride); + yd, ud, vd, yd_stride, uvd_stride, qdiff); } else { // Copy the block from current frame(i.e., no mfqe is done). copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd, diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index ad91c10dd..1a957bc99 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -76,7 +76,7 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); - COLOR_SPACE color_space; + vpx_color_space_t color_space; int width; int height; @@ -272,6 +272,7 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { xd->above_seg_context = cm->above_seg_context; xd->mi_stride = cm->mi_stride; + xd->error_info = &cm->error; } static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index d2ab875e9..4e9ec0f56 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -224,10 +224,12 @@ specialize qw/vp9_lpf_vertical_16_dual sse2 neon_asm dspr2/; $vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon; add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/; +$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon; add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/; +$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon; add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/; @@ -240,10 +242,12 @@ specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/; $vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon; add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; -specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/; +specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon; add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/; +specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/; +$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon; add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"; specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/; @@ -794,16 +798,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x64/, "$sse2_x86inc"; +specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; @@ -833,7 +837,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -1094,7 +1098,7 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; -specialize qw/vp9_avg_8x8 sse2/; +specialize qw/vp9_avg_8x8 sse2 neon/; add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p"; specialize qw/vp9_avg_4x4 sse2/; diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c index 8c4a30353..7a20e0a9e 100644 --- a/vp9/common/vp9_tile_common.c +++ b/vp9/common/vp9_tile_common.c @@ -36,24 +36,24 @@ void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { vp9_tile_set_col(tile, cm, col); } -void vp9_get_tile_n_bits(int mi_cols, - int *min_log2_tile_cols, int *max_log2_tile_cols) { - const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; - int min_log2 = 0, max_log2 = 0; - - // max - while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64) - ++max_log2; - --max_log2; - if (max_log2 < 0) - max_log2 = 0; - - // min - while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols) +static int get_min_log2_tile_cols(const int sb64_cols) { + int min_log2 = 0; + while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols) ++min_log2; + return min_log2; +} - assert(min_log2 <= max_log2); +static int get_max_log2_tile_cols(const int sb64_cols) { + int max_log2 = 1; + while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64) + ++max_log2; + return max_log2 - 1; +} - *min_log2_tile_cols = min_log2; - *max_log2_tile_cols = max_log2; +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols) { + const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; + *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols); + *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols); + assert(*min_log2_tile_cols <= *max_log2_tile_cols); } diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 9677173db..ea4edbffe 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -36,7 +36,6 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" -#include "vp9/decoder/vp9_dthread.h" #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/decoder/vp9_reader.h" @@ -463,8 +462,8 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, subsize = get_subsize(bsize, partition); uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y]; if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid block size."); + vpx_internal_error(xd->error_info, + VPX_CODEC_CORRUPT_FRAME, "Invalid block size."); if (subsize < BLOCK_8X8) { decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); } else { @@ -727,6 +726,8 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { } cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; + cm->frame_bufs[cm->new_fb_idx].buf.color_space = + (vpx_color_space_t)cm->color_space; cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; } @@ -781,7 +782,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, cm->subsampling_x, cm->subsampling_y)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Referenced frame has incompatible color space"); + "Referenced frame has incompatible color format"); } resize_context_buffers(cm, width, height); @@ -1021,6 +1022,15 @@ static int tile_worker_hook(TileWorkerData *const tile_data, const TileInfo *const tile) { int mi_row, mi_col; + if (setjmp(tile_data->error_info.jmp)) { + tile_data->error_info.setjmp = 0; + tile_data->xd.corrupted = 1; + return 0; + } + + tile_data->error_info.setjmp = 1; + tile_data->xd.error_info = &tile_data->error_info; + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { vp9_zero(tile_data->xd.left_context); @@ -1167,6 +1177,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, for (; i > 0; --i) { VP9Worker *const worker = &pbi->tile_workers[i - 1]; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. pbi->mb.corrupted |= !winterface->sync(worker); } if (final_worker > -1) { @@ -1212,8 +1226,8 @@ static void read_bitdepth_colorspace_sampling( cm->use_highbitdepth = 0; #endif } - cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); - if (cm->color_space != SRGB) { + cm->color_space = vp9_rb_read_literal(rb, 3); + if (cm->color_space != VPX_CS_SRGB) { vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { cm->subsampling_x = vp9_rb_read_bit(rb); @@ -1311,9 +1325,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } else { // NOTE: The intra-only frame header does not include the specification // of either the color format or color sub-sampling in profile 0. VP9 - // specifies that the default color space should be YUV 4:2:0 in this + // specifies that the default color format should be YUV 4:2:0 in this // case (normative). - cm->color_space = BT_601; + cm->color_space = VPX_CS_BT_601; cm->subsampling_y = cm->subsampling_x = 1; cm->bit_depth = VPX_BITS_8; #if CONFIG_VP9_HIGHBITDEPTH @@ -1546,8 +1560,6 @@ void vp9_decode_frame(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - init_macroblockd(cm, &pbi->mb); - cm->use_prev_frame_mvs = !cm->error_resilient_mode && cm->width == cm->last_width && cm->height == cm->last_height && @@ -1578,9 +1590,9 @@ void vp9_decode_frame(VP9Decoder *pbi, if (!xd->corrupted) { // If multiple threads are used to decode tiles, then we use those threads // to do parallel loopfiltering. - vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm, - pbi->tile_workers, pbi->num_tile_workers, - cm->lf.filter_level, 0); + vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, + 0, 0, pbi->tile_workers, pbi->num_tile_workers, + &pbi->lf_row_sync); } else { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Decode failed. Frame data is corrupted."); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index cff94db2d..1c2603b0a 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -438,7 +438,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME]; xd->block_refs[ref] = ref_buf; if ((!vp9_is_valid_scale(&ref_buf->sf))) - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf); @@ -451,7 +451,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; if (bsize < BLOCK_8X8) { - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, "Invalid usage of segement feature on small blocks"); return; } diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 1406b4034..7bef265b8 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -32,7 +32,6 @@ #include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_detokenize.h" -#include "vp9/decoder/vp9_dthread.h" static void initialize_dec(void) { static volatile int init_done = 0; @@ -63,8 +62,8 @@ static void vp9_dec_free_mi(VP9_COMMON *cm) { } VP9Decoder *vp9_decoder_create() { - VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi)); - VP9_COMMON *const cm = pbi ? &pbi->common : NULL; + VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi)); + VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL; if (!cm) return NULL; @@ -238,12 +237,12 @@ static void swap_frame_buffers(VP9Decoder *pbi) { // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) - cm->frame_refs[ref_index].idx = INT_MAX; + cm->frame_refs[ref_index].idx = -1; } int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, const uint8_t **psource) { - VP9_COMMON *const cm = &pbi->common; + VP9_COMMON *volatile const cm = &pbi->common; const uint8_t *source = *psource; int retcode = 0; @@ -258,7 +257,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, // TODO(jkoleszar): Error concealment is undefined and non-normative // at this point, but if it becomes so, [0] may not always be the correct // thing to do here. - if (cm->frame_refs[0].idx != INT_MAX) + if (cm->frame_refs[0].idx > 0) cm->frame_refs[0].buf->corrupted = 1; } diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 25b7339ed..1415019a1 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -15,12 +15,11 @@ #include "vpx/vpx_codec.h" #include "vpx_scale/yv12config.h" - +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_thread.h" - -#include "vp9/decoder/vp9_dthread.h" +#include "vp9/decoder/vp9_reader.h" #ifdef __cplusplus extern "C" { @@ -33,6 +32,13 @@ typedef struct TileData { DECLARE_ALIGNED(16, MACROBLOCKD, xd); } TileData; +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + struct vpx_internal_error_info error_info; +} TileWorkerData; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 8704fddac..23d622d70 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -14,6 +14,9 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" +#if CONFIG_COEFFICIENT_RANGE_CHECKING +#include "vp9/common/vp9_idct.h" +#endif #include "vp9/decoder/vp9_detokenize.h" @@ -32,7 +35,7 @@ #define INCREMENT_COUNT(token) \ do { \ if (!cm->frame_parallel_decoding_mode) \ - ++coef_counts[band][ctx][token]; \ + ++coef_counts[band][ctx][token]; \ } while (0) static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) { @@ -191,10 +194,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, } v = (val * dqv) >> dq_shift; #if CONFIG_COEFFICIENT_RANGE_CHECKING +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v), + cm->bit_depth); +#else dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v); +#endif // CONFIG_VP9_HIGHBITDEPTH #else dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v; -#endif +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING token_cache[scan[c]] = vp9_pt_energy_class[token]; ++c; ctx = get_coef_context(nb, token_cache, c); diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c new file mode 100644 index 000000000..f505fcb7a --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { + const uint32x4_t a = vpaddlq_u16(v_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { + uint8x8_t v_s0 = vld1_u8(s); + const uint8x8_t v_s1 = vld1_u8(s + p); + uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + + v_s0 = vld1_u8(s + 2 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 3 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 4 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 5 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 6 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 7 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + return (horizontal_add_u16x8(v_sum) + 32) >> 6; +} diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index 816fbda1f..b1ad83731 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -10,6 +10,7 @@ #include <arm_neon.h> #include "./vp9_rtcd.h" +#include "./vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" @@ -28,6 +29,9 @@ enum { kHeight16PlusOne = 17 }; enum { kWidth32 = 32 }; enum { kHeight32 = 32 }; enum { kHeight32PlusOne = 33 }; +enum { kWidth64 = 64 }; +enum { kHeight64 = 64 }; +enum { kHeight64PlusOne = 65 }; enum { kPixelStepOne = 1 }; enum { kAlign16 = 16 }; @@ -46,9 +50,10 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { return vget_lane_s32(c, 0); } +// w * h must be less than 2048 or local variable v_sum may overflow. static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { + int w, int h, uint32_t *sse, int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); @@ -88,7 +93,7 @@ unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); - return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); + return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 } void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, @@ -103,7 +108,7 @@ unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); - return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); + return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 } static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, @@ -205,7 +210,62 @@ unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, unsigned int *sse) { int sum; variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); - return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); + return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 +} + +unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, &sse1, &sum1); + variance_neon_w8(a + (kHeight32 * a_stride), a_stride, + b + (kHeight32 * b_stride), b_stride, kWidth32, kHeight32, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1); + variance_neon_w8(a + (kHeight16 * a_stride), a_stride, + b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, kWidth64, kHeight16, &sse1, &sum1); + variance_neon_w8(a + (kHeight16 * a_stride), a_stride, + b + (kHeight16 * b_stride), b_stride, kWidth64, kHeight16, + &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (kHeight16 * 2 * a_stride), a_stride, + b + (kHeight16 * 2 * b_stride), b_stride, + kWidth64, kHeight16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (kHeight16 * 3 * a_stride), a_stride, + b + (kHeight16 * 3 * b_stride), b_stride, + kWidth64, kHeight16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 } unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, @@ -225,3 +285,21 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); } + +unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight64 * kWidth64); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight64PlusOne * kWidth64); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight64PlusOne, kWidth64, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth64, kWidth64, kHeight64, + kWidth64, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance64x64_neon(temp2, kWidth64, dst, dst_stride, sse); +} diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 752429c8f..3f4ed94d6 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -297,7 +297,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { write_inter_mode(w, mode, inter_probs); - ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; } } @@ -320,7 +319,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, const int j = idy * 2 + idx; const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; write_inter_mode(w, b_mode, inter_probs); - ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; if (b_mode == NEWMV) { for (ref = 0; ref < 1 + is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, @@ -1057,7 +1055,7 @@ static void write_bitdepth_colorspace_sampling( vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1); } vp9_wb_write_literal(wb, cm->color_space, 3); - if (cm->color_space != SRGB) { + if (cm->color_space != VPX_CS_SRGB) { vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); @@ -1172,8 +1170,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i], counts->inter_mode[i], INTER_MODES, &header_bc); - vp9_zero(counts->inter_mode); - if (cm->interp_filter == SWITCHABLE) update_switchable_interp_probs(cm, &header_bc, counts); diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 506f6de84..41f72f89b 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -170,7 +170,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, vp9_fdct4x4_c(input, output, stride); } else { tran_low_t out[4 * 4]; - tran_low_t *outptr = &out[0]; int i, j; tran_low_t temp_in[4], temp_out[4]; const transform_2d ht = FHT_4[tx_type]; @@ -183,7 +182,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, temp_in[0] += 1; ht.cols(temp_in, temp_out); for (j = 0; j < 4; ++j) - outptr[j * 4 + i] = temp_out[j]; + out[j * 4 + i] = temp_out[j]; } // Rows @@ -711,7 +710,6 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, vp9_fdct8x8_c(input, output, stride); } else { tran_low_t out[64]; - tran_low_t *outptr = &out[0]; int i, j; tran_low_t temp_in[8], temp_out[8]; const transform_2d ht = FHT_8[tx_type]; @@ -722,7 +720,7 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 8; ++j) - outptr[j * 8 + i] = temp_out[j]; + out[j * 8 + i] = temp_out[j]; } // Rows @@ -1103,7 +1101,6 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, vp9_fdct16x16_c(input, output, stride); } else { tran_low_t out[256]; - tran_low_t *outptr = &out[0]; int i, j; tran_low_t temp_in[16], temp_out[16]; const transform_2d ht = FHT_16[tx_type]; @@ -1114,7 +1111,7 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, temp_in[j] = input[j * stride + i] * 4; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } // Rows diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 56ec6b335..a7aaff0cf 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -49,9 +49,7 @@ static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { } static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { - return (4 << b_width_log2_lookup[bs]) * - (4 << b_height_log2_lookup[bs]) * - (increase_denoising ? 60 : 40); + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40); } static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, @@ -60,19 +58,16 @@ static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, noise_motion_thresh(bs, increase_denoising)) { return 0; } else { - return (4 << b_width_log2_lookup[bs]) * - (4 << b_height_log2_lookup[bs]) * 20; + return (1 << num_pels_log2_lookup[bs]) * 20; } } int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { - return (4 << b_width_log2_lookup[bs]) * - (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2); + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); } static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { - return (4 << b_width_log2_lookup[bs]) * - (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2); + return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2); } // TODO(jackychen): If increase_denoising is enabled in the future, @@ -195,16 +190,6 @@ static uint8_t *block_start(uint8_t *framebuf, int stride, return framebuf + (stride * mi_row * 8) + (mi_col * 8); } -static void copy_block(uint8_t *dest, int dest_stride, - const uint8_t *src, int src_stride, BLOCK_SIZE bs) { - int r; - for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) { - vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs])); - dest += dest_stride; - src += src_stride; - } -} - static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, @@ -219,28 +204,18 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, MV_REFERENCE_FRAME frame; MACROBLOCKD *filter_mbd = &mb->e_mbd; MB_MODE_INFO *mbmi = &filter_mbd->mi[0].src_mi->mbmi; - MB_MODE_INFO saved_mbmi; int i, j; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE][2]; // 2 pre buffers - // We will restore these after motion compensation. - saved_mbmi = *mbmi; - for (i = 0; i < MAX_MB_PLANE; ++i) { - for (j = 0; j < 2; ++j) { - saved_pre[i][j] = filter_mbd->plane[i].pre[j]; - } - saved_dst[i] = filter_mbd->plane[i].dst; - } - mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; - *motion_magnitude = mv_row * mv_row + mv_col * mv_col; - frame = ctx->best_reference_frame; + saved_mbmi = *mbmi; + // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. if (frame != INTRA_FRAME && @@ -261,6 +236,26 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, ctx->newmv_sse = ctx->zeromv_sse; } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { + // Restore everything to its original state + *mbmi = saved_mbmi; + return COPY_BLOCK; + } + if (mv_row * mv_row + mv_col * mv_col > + 8 * noise_motion_thresh(bs, increase_denoising)) { + // Restore everything to its original state + *mbmi = saved_mbmi; + return COPY_BLOCK; + } + + // We will restore these after motion compensation. + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (j = 0; j < 2; ++j) { + saved_pre[i][j] = filter_mbd->plane[i].pre[j]; + } + saved_dst[i] = filter_mbd->plane[i].dst; + } + // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser // struct. for (j = 0; j < 2; ++j) { @@ -313,13 +308,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, mv_row = ctx->best_sse_mv.as_mv.row; mv_col = ctx->best_sse_mv.as_mv.col; - if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { - return COPY_BLOCK; - } - if (mv_row * mv_row + mv_col * mv_col > - 8 * noise_motion_thresh(bs, increase_denoising)) { - return COPY_BLOCK; - } return FILTER_BLOCK; } @@ -348,9 +336,15 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, } if (decision == FILTER_BLOCK) { - copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs); + vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, + NULL, 0, NULL, 0, + num_4x4_blocks_wide_lookup[bs] << 2, + num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs); + vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, + NULL, 0, NULL, 0, + num_4x4_blocks_wide_lookup[bs] << 2, + num_4x4_blocks_high_lookup[bs] << 2); } } @@ -358,6 +352,7 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) { int r; const uint8_t *srcbuf = src.y_buffer; uint8_t *destbuf = dest.y_buffer; + assert(dest.y_width == src.y_width); assert(dest.y_height == src.y_height); @@ -368,6 +363,15 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) { } } +static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest, + YV12_BUFFER_CONFIG *src) { + uint8_t *tmp_buf = dest->y_buffer; + assert(dest->y_width == src->y_width); + assert(dest->y_height == src->y_height); + dest->y_buffer = src->y_buffer; + src->y_buffer = tmp_buf; +} + void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, @@ -377,28 +381,32 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, if (frame_type == KEY_FRAME) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME - for (i = 1; i < MAX_REF_FRAMES; ++i) { + for (i = 1; i < MAX_REF_FRAMES; ++i) copy_frame(denoiser->running_avg_y[i], src); - } - } else { /* For non key frames */ - if (refresh_alt_ref_frame) { - copy_frame(denoiser->running_avg_y[ALTREF_FRAME], - denoiser->running_avg_y[INTRA_FRAME]); - } - if (refresh_golden_frame) { - copy_frame(denoiser->running_avg_y[GOLDEN_FRAME], - denoiser->running_avg_y[INTRA_FRAME]); - } - if (refresh_last_frame) { - copy_frame(denoiser->running_avg_y[LAST_FRAME], - denoiser->running_avg_y[INTRA_FRAME]); - } + return; + } + + /* For non key frames */ + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], + &denoiser->running_avg_y[INTRA_FRAME]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], + &denoiser->running_avg_y[INTRA_FRAME]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], + &denoiser->running_avg_y[INTRA_FRAME]); } } void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { ctx->zeromv_sse = UINT_MAX; - ctx->newmv_sse = UINT_MAX; + // This should be initialized as zero since mode search stage might skip + // NEWMV mode if inferred motion vector modes provide sufficiently good + // prediction quality. + ctx->newmv_sse = 0; } void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, @@ -458,12 +466,14 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, make_grayscale(&denoiser->running_avg_y[i]); #endif denoiser->increase_denoising = 0; + denoiser->frame_buffer_initialized = 1; return 0; } void vp9_denoiser_free(VP9_DENOISER *denoiser) { int i; + denoiser->frame_buffer_initialized = 0; if (denoiser == NULL) { return; } @@ -483,15 +493,13 @@ static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { uint8_t *u = yuv->u_buffer; uint8_t *v = yuv->v_buffer; - // The '/2's are there because we have a 440 buffer, but we want to output - // 420. - for (r = 0; r < yuv->uv_height / 2; ++r) { - for (c = 0; c < yuv->uv_width / 2; ++c) { + for (r = 0; r < yuv->uv_height; ++r) { + for (c = 0; c < yuv->uv_width; ++c) { u[c] = UINT8_MAX / 2; v[c] = UINT8_MAX / 2; } - u += yuv->uv_stride + yuv->uv_width / 2; - v += yuv->uv_stride + yuv->uv_width / 2; + u += yuv->uv_stride; + v += yuv->uv_stride; } } #endif diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index 421dfcd0c..8eb5da1b8 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -29,6 +29,7 @@ typedef struct vp9_denoiser { YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; YV12_BUFFER_CONFIG mc_running_avg_y; int increase_denoising; + int frame_buffer_initialized; } VP9_DENOISER; void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 4c948237d..756052771 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -55,9 +55,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData * td, int mi_row, int mi_col, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); -// Motion vector component magnitude threshold for defining fast motion. -#define FAST_MOTION_MV_THRESH 24 - // This is used as a reference when computing the source variance for the // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, @@ -1010,22 +1007,20 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { const MACROBLOCKD *const xd = &x->e_mbd; const MODE_INFO *const mi = xd->mi[0].src_mi; const MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; if (!frame_is_intra_only(cm)) { + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) { - FRAME_COUNTS *const counts = td->counts; - const int inter_block = is_inter_block(mbmi); - counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; - // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. if (inter_block) { const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; - if (cm->reference_mode == REFERENCE_MODE_SELECT) counts->comp_inter[vp9_get_reference_mode_context(cm, xd)] [has_second_ref(mbmi)]++; @@ -1042,6 +1037,25 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { } } } + if (inter_block && + !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; + if (bsize >= BLOCK_8X8) { + const PREDICTION_MODE mode = mbmi->mode; + ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; + ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; + } + } + } + } } } @@ -1410,6 +1424,11 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, const int pred_ctx = vp9_get_pred_context_switchable_interp(xd); ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter]; } + + if (mbmi->sb_type < BLOCK_8X8) { + mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; + } } if (cm->use_prev_frame_mvs) { @@ -2705,9 +2724,12 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); - else + else if (bsize >= BLOCK_8X8) vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); + else + vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col, + rd_cost, bsize, ctx); duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); @@ -3219,7 +3241,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, pc_tree->vertical[0].skip = x->skip; encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, subsize, &pc_tree->vertical[0]); - if (mi_col + hbs < cm->mi_cols) { + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { pc_tree->vertical[1].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, dummy_cost, subsize, &pc_tree->vertical[1]); @@ -3240,7 +3262,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, subsize, &pc_tree->horizontal[0]); - if (mi_row + hbs < cm->mi_rows) { + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { pc_tree->horizontal[1].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, dummy_cost, subsize, &pc_tree->horizontal[1]); @@ -3312,9 +3334,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, // Set the partition type of the 64X64 block switch (sf->partition_search_type) { case VAR_BASED_PARTITION: - // TODO(jingning) Only key frame coding supports sub8x8 block at this - // point. To be continued to enable sub8x8 block mode decision for - // P frames. + // TODO(jingning, marpan): The mode decision and encoding process + // support both intra and inter sub8x8 block coding for RTC mode. + // Tune the thresholds accordingly to use sub8x8 block coding for + // coding performance improvement. choose_partitioning(cpi, tile_info, x, mi_row, mi_col); nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 9c29eb438..70b804e31 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -93,7 +93,7 @@ typedef struct vp9_token_state { int rate; int error; int next; - signed char token; + int16_t token; short qc; } vp9_token_state; @@ -147,9 +147,15 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, int next = eob, sz = 0; int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; - int rate0, rate1, error0, error1, t0, t1; + int rate0, rate1, error0, error1; + int16_t t0, t1; + EXTRABIT e0; int best, band, pt, i, final_eob; - const int16_t *dct_value_cost; +#if CONFIG_VP9_HIGHBITDEPTH + const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); +#else + const int16_t *cat6_high_cost = vp9_get_high_cost_table(8); +#endif assert((!type && !plane) || (type && plane)); assert(eob <= default_eob); @@ -166,17 +172,6 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, tokens[eob][0].qc = 0; tokens[eob][1] = tokens[eob][0]; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->bd == 12) { - dct_value_cost = vp9_dct_value_cost_high12_ptr; - } else if (xd->bd == 10) { - dct_value_cost = vp9_dct_value_cost_high10_ptr; - } else { - dct_value_cost = vp9_dct_value_cost_ptr; - } -#else - dct_value_cost = vp9_dct_value_cost_ptr; -#endif for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])]; @@ -193,7 +188,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, /* Evaluate the first possibility for this state. */ rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - t0 = vp9_get_token(x); + vp9_get_token_extra(x, &t0, &e0); /* Consider both possible successor states. */ if (next < default_eob) { band = band_translate[i + 1]; @@ -206,7 +201,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, UPDATE_RD_COST(); /* And pick the best. */ best = rd_cost1 < rd_cost0; - base_bits = dct_value_cost[x]; + base_bits = vp9_get_cost(t0, e0, cat6_high_cost); dx = mul * (dqcoeff[rc] - coeff[rc]); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -244,8 +239,10 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, */ t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; + e0 = 0; } else { - t0 = t1 = vp9_get_token(x); + vp9_get_token_extra(x, &t0, &e0); + t1 = t0; } if (next < default_eob) { band = band_translate[i + 1]; @@ -264,7 +261,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block, UPDATE_RD_COST(); /* And pick the best. */ best = rd_cost1 < rd_cost0; - base_bits = dct_value_cost[x]; + base_bits = vp9_get_cost(t0, e0, cat6_high_cost); if (shortcut) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 430ad17e0..35fea57f5 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -186,7 +186,6 @@ void vp9_initialize_enc(void) { if (!init_done) { vp9_rtcd(); vp9_init_intra_predictors(); - vp9_tokenize_initialize(); vp9_init_me_luts(); vp9_rc_init_minq_luts(); vp9_entropy_mv_init(); @@ -608,7 +607,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth = oxcf->use_highbitdepth; #endif - cm->color_space = UNKNOWN; + cm->color_space = oxcf->color_space; cm->width = oxcf->width; cm->height = oxcf->height; @@ -1265,6 +1264,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; + cm->color_space = oxcf->color_space; if (cm->profile <= PROFILE_1) assert(cm->bit_depth == VPX_BITS_8); @@ -1348,17 +1348,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); #endif - -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) { - vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS); - } -#endif } #ifndef M_LOG2_E @@ -1406,8 +1395,8 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { unsigned int i; - VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP)); - VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL; + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; if (!cm) return NULL; @@ -1792,14 +1781,12 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) { - vp9_denoiser_free(&(cpi->denoiser)); - } + vp9_denoiser_free(&(cpi->denoiser)); #endif for (t = 0; t < cpi->num_workers; ++t) { VP9Worker *const worker = &cpi->workers[t]; - EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; // Deallocate allocated threads. vp9_get_worker_interface()->end(worker); @@ -1810,11 +1797,13 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_free_pc_tree(thread_data->td); vpx_free(thread_data->td); } - - vpx_free(worker->data1); } + vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); + if (cpi->num_workers > 1) + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + dealloc_compressor_data(cpi); for (i = 0; i < sizeof(cpi->mbgraph_stats) / @@ -2140,19 +2129,19 @@ void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { } while (--h); src = s->u_buffer; - h = s->uv_height / 2; + h = s->uv_height; do { - fwrite(src, s->uv_width / 2, 1, f); - src += s->uv_stride + s->uv_width / 2; + fwrite(src, s->uv_width, 1, f); + src += s->uv_stride; } while (--h); src = s->v_buffer; - h = s->uv_height / 2; + h = s->uv_height; do { - fwrite(src, s->uv_width / 2, 1, f); - src += s->uv_stride + s->uv_width / 2; + fwrite(src, s->uv_width, 1, f); + src += s->uv_stride; } while (--h); } #endif @@ -2450,7 +2439,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { } if (lf->filter_level > 0) { - vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, + lf->filter_level, 0, 0, + cpi->workers, cpi->num_workers, + &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show); @@ -2550,7 +2545,7 @@ static void full_to_model_counts(vp9_coeff_count_model *model_count, static void output_frame_level_debug_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); - int recon_err; + int64_t recon_err; vp9_clear_system_state(); @@ -2562,7 +2557,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" - "%10lf %8u %10d %10d %10d\n", + "%10lf %8u %10"PRId64" %10d %10d\n", cpi->common.current_video_frame, cpi->rc.this_frame_target, cpi->rc.projected_frame_size, cpi->rc.projected_frame_size / cpi->common.MBs, @@ -2891,15 +2886,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, rc->this_key_frame_forced && (rc->projected_frame_size < rc->max_frame_bandwidth)) { int last_q = q; - int kf_err; + int64_t kf_err; - int high_err_target = cpi->ambient_err; - int low_err_target = cpi->ambient_err >> 1; + int64_t high_err_target = cpi->ambient_err; + int64_t low_err_target = cpi->ambient_err >> 1; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { - kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm), - cm->bit_depth); + kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); } else { kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); } @@ -2920,7 +2914,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, q_high = q > q_low ? q - 1 : q_low; // Adjust Q - q = (q * high_err_target) / kf_err; + q = (int)((q * high_err_target) / kf_err); q = MIN(q, (q_high + q_low) >> 1); } else if (kf_err < low_err_target && rc->projected_frame_size >= frame_under_shoot_limit) { @@ -2929,7 +2923,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, q_low = q < q_high ? q + 1 : q_high; // Adjust Q - q = (q * low_err_target) / kf_err; + q = (int)((q * low_err_target) / kf_err); q = MIN(q, (q_high + q_low + 1) >> 1); } @@ -3257,8 +3251,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source, - get_frame_new_buffer(cm), - cm->bit_depth); + get_frame_new_buffer(cm)); } else { cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); } @@ -3409,6 +3402,20 @@ static void check_initial_width(VP9_COMP *cpi, } } +#if CONFIG_VP9_TEMPORAL_DENOISING +static void setup_denoiser_buffer(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->oxcf.noise_sensitivity > 0 && + !cpi->denoiser.frame_buffer_initialized) { + vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS); + } +} +#endif int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, @@ -3425,6 +3432,9 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, check_initial_width(cpi, subsampling_x, subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags)) @@ -3435,13 +3445,13 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && (subsampling_x != 1 || subsampling_y != 1)) { vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, - "Non-4:2:0 color space requires profile 1 or 3"); + "Non-4:2:0 color format requires profile 1 or 3"); res = -1; } if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && (subsampling_x == 1 && subsampling_y == 1)) { vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, - "4:2:0 color space requires profile 0 or 2"); + "4:2:0 color format requires profile 0 or 2"); res = -1; } @@ -3999,6 +4009,10 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, check_initial_width(cpi, 1, 1); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + if (width) { cm->width = width; if (cm->width > cpi->initial_width) { @@ -4027,41 +4041,25 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) { return; } -int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { +int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); - return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height); + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); } #if CONFIG_VP9_HIGHBITDEPTH -int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - vpx_bit_depth_t bit_depth) { - unsigned int sse; - int sum; +int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { assert(a->y_crop_width == b->y_crop_width); assert(a->y_crop_height == b->y_crop_height); assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); - switch (bit_depth) { - case VPX_BITS_8: - highbd_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height, &sse, &sum); - return (int) sse; - case VPX_BITS_10: - highbd_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height, &sse, &sum); - return (int) sse; - case VPX_BITS_12: - highbd_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, - a->y_crop_width, a->y_crop_height, &sse, &sum); - return (int) sse; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; - } + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 7872e2cc1..cf269c108 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -19,6 +19,7 @@ #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_thread.h" @@ -36,6 +37,7 @@ #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" + #if CONFIG_VP9_TEMPORAL_DENOISING #include "vp9/encoder/vp9_denoiser.h" #endif @@ -231,6 +233,7 @@ typedef struct VP9EncoderConfig { #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth; #endif + vpx_color_space_t color_space; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -261,6 +264,8 @@ typedef struct ThreadData { PC_TREE *pc_root; } ThreadData; +struct EncWorkerData; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -304,7 +309,7 @@ typedef struct VP9_COMP { unsigned int tok_count[4][1 << 6]; // Ambient reconstruction err target for force key frames - int ambient_err; + int64_t ambient_err; RD_OPT rd; @@ -446,6 +451,8 @@ typedef struct VP9_COMP { // Multi-threading int num_workers; VP9Worker *workers; + struct EncWorkerData *tile_thr_data; + VP9LfSync lf_row_sync; } VP9_COMP; void vp9_initialize_enc(void); @@ -534,11 +541,10 @@ static INLINE int allocated_tokens(TileInfo tile) { return get_token_alloc(tile_mb_rows, tile_mb_cols); } -int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH -int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, - const YV12_BUFFER_CONFIG *b, - vpx_bit_depth_t bit_depth); +int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); #endif // CONFIG_VP9_HIGHBITDEPTH void vp9_alloc_compressor_data(VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index daf3da44c..12fb4d107 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -167,23 +167,24 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + for (i = 0; i < num_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; ++cpi->num_workers; - winterface->init(worker); - CHECK_MEM_ERROR(cm, worker->data1, - (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData))); - thread_data = (EncWorkerData*)worker->data1; if (i < num_workers - 1) { thread_data->cpi = cpi; // Allocate thread data. CHECK_MEM_ERROR(cm, thread_data->td, - vpx_calloc(1, sizeof(*thread_data->td))); + vpx_memalign(32, sizeof(*thread_data->td))); + vp9_zero(*thread_data->td); + // Set up pc_tree. thread_data->td->leaf_tree = NULL; thread_data->td->pc_tree = NULL; @@ -203,17 +204,18 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { thread_data->td = &cpi->td; } - // data2 is unused. - worker->data2 = NULL; - winterface->sync(worker); - worker->hook = (VP9WorkerHook)enc_worker_hook; } } for (i = 0; i < num_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + EncWorkerData *thread_data; + + worker->hook = (VP9WorkerHook)enc_worker_hook; + worker->data1 = &cpi->tile_thr_data[i]; + worker->data2 = NULL; + thread_data = (EncWorkerData*)worker->data1; // Before encoding a frame, copy the thread data from cpi. thread_data->td->mb = cpi->td.mb; diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 81334e448..a95f0f46d 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -33,16 +33,23 @@ static int get_max_filter_level(const VP9_COMP *cpi) { } -static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi, - int filt_level, int partial_frame) { +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + VP9_COMP *const cpi, + int filt_level, int partial_frame) { VP9_COMMON *const cm = &cpi->common; - int filt_err; + int64_t filt_err; + + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, + filt_level, 1, partial_frame, + cpi->workers, cpi->num_workers, &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, + 1, partial_frame); - vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { - filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth); + filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show); } else { filt_err = vp9_get_y_sse(sd, cm->frame_to_show); } @@ -63,14 +70,15 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, const int min_filter_level = 0; const int max_filter_level = get_max_filter_level(cpi); int filt_direction = 0; - int best_err, filt_best; + int64_t best_err; + int filt_best; // Start the search at the previous frame filter level unless it is now out of // range. int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Sum squared error at each filter level - int ss_err[MAX_LOOP_FILTER + 1]; + int64_t ss_err[MAX_LOOP_FILTER + 1]; // Set each entry to -1 vpx_memset(ss_err, 0xFF, sizeof(ss_err)); @@ -87,7 +95,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, const int filt_low = MAX(filt_mid - filter_step, min_filter_level); // Bias against raising loop filter in favor of lowering it. - int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) bias = (bias * cpi->twopass.section_intra_rating) / 20; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 319a47833..5acfcc51d 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -540,8 +540,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac; const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); - const int8_t segment_id = mbmi->segment_id; - const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; INTERP_FILTER filter_ref; const int bsl = mi_width_log2_lookup[bsize]; @@ -605,7 +604,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, tx_mode_to_biggest_tx_size[cm->tx_mode]); mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; - mbmi->segment_id = segment_id; + +#if CONFIG_VP9_TEMPORAL_DENOISING + vp9_denoiser_reset_frame_stats(ctx); +#endif for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; @@ -662,6 +664,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd); mbmi->ref_frame[0] = ref_frame; + set_ref_ptrs(cm, xd, ref_frame, NONE); for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int rate_mv = 0; @@ -946,3 +949,247 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rd_cost = best_rdc; } + +void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + TileInfo *const tile_info = &tile_data->tile_info; + SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi; + const struct segmentation *const seg = &cm->seg; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE; + MV_REFERENCE_FRAME best_ref_frame = NONE; + unsigned char segment_id = mbmi->segment_id; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int64_t best_rd = INT64_MAX; + b_mode_info bsi[MAX_REF_FRAMES][4]; + int ref_frame_skip_mask = 0; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + + x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + ctx->pred_pixel_ready = 0; + + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + int_mv dummy_mv[2]; + x->pred_mv_sad[ref_frame] = INT_MAX; + + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + int_mv *const candidates = mbmi->ref_mvs[ref_frame]; + const struct scale_factors *const sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, + sf, sf); + vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame, + candidates, mi_row, mi_col); + + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &dummy_mv[0], &dummy_mv[1]); + } else { + ref_frame_skip_mask |= (1 << ref_frame); + } + } + + mbmi->sb_type = bsize; + mbmi->tx_size = TX_4X4; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP + : cm->interp_filter; + + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { + int64_t this_rd = 0; + int plane; + + if (ref_frame_skip_mask & (1 << ref_frame)) + continue; + + // TODO(jingning, agrange): Scaling reference frame not supported for + // sub8x8 blocks. Is this supported now? + if (ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) + continue; + + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + mbmi->ref_frame[0] = ref_frame; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. + for (plane = 0; plane < MAX_MB_PLANE; plane++) + xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane]; + + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + int_mv b_mv[MB_MODE_COUNT]; + int64_t b_best_rd = INT64_MAX; + const int i = idy * 2 + idx; + PREDICTION_MODE this_mode; + int b_rate = 0; + int64_t b_dist = 0; + RD_COST this_rdc; + unsigned int var_y, sse_y; + + struct macroblock_plane *p = &x->plane[0]; + struct macroblockd_plane *pd = &xd->plane[0]; + + const struct buf_2d orig_src = p->src; + const struct buf_2d orig_dst = pd->dst; + struct buf_2d orig_pre[2]; + vpx_memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre)); + + // set buffer pointers for sub8x8 motion search. + p->src.buf = + &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + pd->dst.buf = + &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; + pd->pre[0].buf = + &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, + i, pd->pre[0].stride)]; + + b_mv[ZEROMV].as_int = 0; + b_mv[NEWMV].as_int = INVALID_MV; + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col, + &b_mv[NEARESTMV], + &b_mv[NEARMV]); + + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + xd->mi[0].bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int; + + if (this_mode == NEWMV) { + const int step_param = cpi->sf.mv.fullpel_search_step_param; + MV mvp_full; + MV tmp_mv; + int cost_list[5]; + const int tmp_col_min = x->mv_col_min; + const int tmp_col_max = x->mv_col_max; + const int tmp_row_min = x->mv_row_min; + const int tmp_row_max = x->mv_row_max; + int dummy_dist; + + if (i == 0) { + mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3; + mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3; + } else { + mvp_full.row = xd->mi[0].bmi[0].as_mv[0].as_mv.row >> 3; + mvp_full.col = xd->mi[0].bmi[0].as_mv[0].as_mv.col >> 3; + } + + vp9_set_mv_search_range(x, &mbmi->ref_mvs[0]->as_mv); + + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, x->sadperbit4, + cond_cost_list(cpi, cost_list), + &mbmi->ref_mvs[ref_frame][0].as_mv, &tmp_mv, + INT_MAX, 0); + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + // calculate the bit cost on motion vector + mvp_full.row = tmp_mv.row * 8; + mvp_full.col = tmp_mv.col * 8; + + b_rate += vp9_mv_bit_cost(&mvp_full, + &mbmi->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, + MV_COST_WEIGHT); + + b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] + [INTER_OFFSET(NEWMV)]; + if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd) + continue; + + cpi->find_fractional_mv_step(x, &tmp_mv, + &mbmi->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, + &dummy_dist, + &x->pred_sse[ref_frame], NULL, 0, 0); + + xd->mi[0].bmi[i].as_mv[0].as_mv = tmp_mv; + } + + vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride, + pd->dst.buf, pd->dst.stride, + &xd->mi[0].bmi[i].as_mv[0].as_mv, + &xd->block_refs[0]->sf, + 4 * num_4x4_blocks_wide, + 4 * num_4x4_blocks_high, 0, + vp9_get_interp_kernel(mbmi->interp_filter), + MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i & 0x01), + mi_row * MI_SIZE + 4 * (i >> 1)); + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y); + + this_rdc.rate += b_rate; + this_rdc.dist += b_dist; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, + this_rdc.rate, this_rdc.dist); + if (this_rdc.rdcost < b_best_rd) { + b_best_rd = this_rdc.rdcost; + bsi[ref_frame][i].as_mode = this_mode; + bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0].bmi[i].as_mv[0].as_mv; + } + } // mode search + + // restore source and prediction buffer pointers. + p->src = orig_src; + pd->pre[0] = orig_pre[0]; + pd->dst = orig_dst; + this_rd += b_best_rd; + + xd->mi[0].bmi[i] = bsi[ref_frame][i]; + if (num_4x4_blocks_wide > 1) + xd->mi[0].bmi[i + 1] = xd->mi[0].bmi[i]; + if (num_4x4_blocks_high > 1) + xd->mi[0].bmi[i + 2] = xd->mi[0].bmi[i]; + } + } // loop through sub8x8 blocks + + if (this_rd < best_rd) { + best_rd = this_rd; + best_ref_frame = ref_frame; + } + } // reference frames + + mbmi->tx_size = TX_4X4; + mbmi->ref_frame[0] = best_ref_frame; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int block = idy * 2 + idx; + xd->mi[0].bmi[block] = bsi[best_ref_frame][block]; + if (num_4x4_blocks_wide > 1) + xd->mi[0].bmi[block + 1] = bsi[best_ref_frame][block]; + if (num_4x4_blocks_high > 1) + xd->mi[0].bmi[block + 2] = bsi[best_ref_frame][block]; + } + } + mbmi->mode = xd->mi[0].bmi[3].as_mode; + ctx->mic = *(xd->mi[0].src_mi); + ctx->skip_txfm[0] = 0; + ctx->skip = 0; + // Dummy assignment for speed -5. No effect in speed -6. + rd_cost->rdcost = best_rd; +} diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h index 57a1cf937..11f44099c 100644 --- a/vp9/encoder/vp9_pickmode.h +++ b/vp9/encoder/vp9_pickmode.h @@ -26,6 +26,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); +void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, + TileDataEnc *tile_data, + int mi_row, int mi_col, RD_COST *rd_cost, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 34d49f058..375407d44 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -516,6 +516,20 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, } } +int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, + int raster_block, int stride) { + const int bw = b_width_log2_lookup[plane_bsize]; + const int y = 4 * (raster_block >> bw); + const int x = 4 * (raster_block & ((1 << bw) - 1)); + return y * stride + x; +} + +int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, + int raster_block, int16_t *base) { + const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + return base + vp9_raster_block_offset(plane_bsize, raster_block, stride); +} + const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, int ref_frame) { const VP9_COMMON *const cm = &cpi->common; diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index e1593af5a..59a87cf98 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -141,6 +141,12 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); +int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, + int raster_block, int stride); + +int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, + int raster_block, int16_t *base); + const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi, int ref_frame); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index ded082f86..a183fdc69 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -129,19 +129,6 @@ static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { {{INTRA_FRAME, NONE}}, }; -static int raster_block_offset(BLOCK_SIZE plane_bsize, - int raster_block, int stride) { - const int bw = b_width_log2_lookup[plane_bsize]; - const int y = 4 * (raster_block >> bw); - const int x = 4 * (raster_block & ((1 << bw) - 1)); - return y * stride + x; -} -static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, - int raster_block, int16_t *base) { - const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; - return base + raster_block_offset(plane_bsize, raster_block, stride); -} - static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, int min_plane, int max_plane) { int i; @@ -360,6 +347,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x, uint8_t token_cache[32 * 32]; int pt = combine_entropy_contexts(*A, *L); int c, cost; +#if CONFIG_VP9_HIGHBITDEPTH + const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); +#else + const int16_t *cat6_high_cost = vp9_get_high_cost_table(8); +#endif + // Check for consistency of tx_size with mode info assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size : get_uv_tx_size(mbmi, pd) == tx_size); @@ -373,23 +366,29 @@ static INLINE int cost_coeffs(MACROBLOCK *x, // dc token int v = qcoeff[0]; - int prev_t = vp9_get_token(v); - cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; + int16_t prev_t; + EXTRABIT e; + vp9_get_token_extra(v, &prev_t, &e); + cost = (*token_costs)[0][pt][prev_t] + + vp9_get_cost(prev_t, e, cat6_high_cost); + token_cache[0] = vp9_pt_energy_class[prev_t]; ++token_costs; // ac tokens for (c = 1; c < eob; c++) { const int rc = scan[c]; - int t; + int16_t t; v = qcoeff[rc]; - t = vp9_get_token(v); + vp9_get_token_extra(v, &t, &e); if (use_fast_coef_costing) { - cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v]; + cost += (*token_costs)[!prev_t][!prev_t][t] + + vp9_get_cost(t, e, cat6_high_cost); } else { pt = get_coef_context(nb, token_cache, c); - cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; + cost += (*token_costs)[!prev_t][pt][t] + + vp9_get_cost(t, e, cat6_high_cost); token_cache[rc] = vp9_pt_energy_class[t]; } prev_t = t; @@ -761,10 +760,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; - const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib, - src_stride)]; - uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib, - dst_stride)]; + const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib, + src_stride)]; + uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib, + dst_stride)]; ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; @@ -808,8 +807,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, const int block = ib + idy * 2 + idx; const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; - int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block, - p->src_diff); + int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, + block, + p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); xd->mi[0].src_mi->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, block, 1, @@ -908,8 +908,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, const int block = ib + idy * 2 + idx; const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; - int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block, - p->src_diff); + int16_t *const src_diff = + vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); xd->mi[0].src_mi->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, block, 1, @@ -1341,10 +1341,10 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize]; int idx, idy; - const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i, - p->src.stride)]; - uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i, - pd->dst.stride)]; + const uint8_t *const src = + &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, + pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0, ref; const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -1352,7 +1352,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); for (ref = 0; ref < 1 + is_compound; ++ref) { - const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i, + const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[ref].stride)]; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -1386,17 +1386,17 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_subtract_block( - height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, - src, p->src.stride, dst, pd->dst.stride, xd->bd); + height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); } else { vp9_subtract_block( - height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, - src, p->src.stride, dst, pd->dst.stride); + height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride); } #else vp9_subtract_block(height, width, - raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, - src, p->src.stride, dst, pd->dst.stride); + vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), + 8, src, p->src.stride, dst, pd->dst.stride); #endif // CONFIG_VP9_HIGHBITDEPTH k = i; @@ -1407,7 +1407,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, k += (idy * 2 + idx); coeff = BLOCK_OFFSET(p->coeff, k); - x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH @@ -1480,13 +1480,14 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; - p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)]; + p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, + p->src.stride)]; assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); - pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i, - pd->pre[0].stride)]; + pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i, + pd->pre[0].stride)]; if (has_second_ref(mbmi)) - pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i, - pd->pre[1].stride)]; + pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i, + pd->pre[1].stride)]; } static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 31e93be65..82bce3780 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -279,6 +279,7 @@ static void get_layer_resolution(const int width_org, const int height_org, int vp9_svc_start_frame(VP9_COMP *const cpi) { int width = 0, height = 0; LAYER_CONTEXT *lc; + struct lookahead_entry *buf; int count = 1 << (cpi->svc.number_temporal_layers - 1); cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; @@ -339,8 +340,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) { // since its previous frame could be changed during decoding time. The idea is // we put a empty invisible frame in front of them, then we will not use // prev_mi when encoding these frames. + + buf = vp9_lookahead_peek(cpi->lookahead, 0); if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && - cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) { + cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE && + lc->rc.frames_to_key != 0 && + !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) { if ((cpi->svc.number_temporal_layers > 1 && cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || (cpi->svc.number_spatial_layers > 1 && diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 54b2594e7..4c8995356 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -23,19 +23,6 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_tokenize.h" -static int16_t dct_value_cost[DCT_MAX_VALUE * 2]; -const int16_t *vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; - -#if CONFIG_VP9_HIGHBITDEPTH -static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2]; -const int16_t *vp9_dct_value_cost_high10_ptr = - dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10; - -static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2]; -const int16_t *vp9_dct_value_cost_high12_ptr = - dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12; -#endif - static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49}, {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33}, @@ -98,6 +85,298 @@ static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0}; static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0}; +static const int16_t zero_cost[] = {0}; +static const int16_t one_cost[] = {255, 257}; +static const int16_t two_cost[] = {255, 257}; +static const int16_t three_cost[] = {255, 257}; +static const int16_t four_cost[] = {255, 257}; +static const int16_t cat1_cost[] = {429, 431, 616, 618}; +static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953}; +static const int16_t cat3_cost[] = { + 820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218, + 1289, 1291 +}; +static const int16_t cat4_cost[] = { + 1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239, + 1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446, + 1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608 +}; +static const int16_t cat5_cost[] = { + 1269, 1271, 1283, 1285, 1306, 1308, 1320, + 1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457, + 1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572, + 1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684, + 1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817, + 1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897 +}; +const int16_t vp9_cat6_low_cost[256] = { + 1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662, + 1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721, + 1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767, + 1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829, + 1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884, + 1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950, + 1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996, + 2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055, + 2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113, + 2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172, + 2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218, + 2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124, + 2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179, + 2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241, + 2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287, + 2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346, + 2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408, + 2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467, + 2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513, + 2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575, + 2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630, + 2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662 +}; +const int16_t vp9_cat6_high_cost[128] = { + 72, 892, 1183, 2003, 1448, 2268, 2559, 3379, + 1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049, + 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062, + 2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, + 5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, + 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049, + 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062, + 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732, + 7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, + 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141, + 7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +const int16_t vp9_cat6_high10_high_cost[512] = { + 74, 894, 1185, 2005, 1450, 2270, 2561, + 3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231, + 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244, + 7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, + 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, + 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231, + 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244, + 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, + 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653, + 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, + 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, + 11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, + 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, + 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, + 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, + 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, + 9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, + 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, + 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, + 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, + 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, + 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120, + 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, + 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, + 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, + 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, + 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, + 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, + 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588, + 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212, + 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, + 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565, + 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277, + 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, + 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, + 9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, + 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, + 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, + 12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, + 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, + 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258, + 9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, + 11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791, + 13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248, +}; +const int16_t vp9_cat6_high12_high_cost[2048] = { + 76, 896, 1187, 2007, 1452, 2272, 2563, + 3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233, + 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, + 7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, + 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, + 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233, + 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, + 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, + 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, + 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, + 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, + 11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, + 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, + 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, + 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, + 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, + 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, + 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, + 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, + 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, + 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, + 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122, + 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, + 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, + 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, + 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, + 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, + 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, + 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, + 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, + 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, + 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, + 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, + 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, + 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, + 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, + 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, + 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, + 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, + 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, + 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, + 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, + 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, + 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942, + 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, + 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, + 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, + 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, + 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, + 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, + 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, + 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, + 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, + 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, + 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099, + 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, + 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, + 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, + 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, + 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, + 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, + 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, + 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, + 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, + 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, + 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988, + 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, + 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, + 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, + 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, + 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, + 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, + 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, + 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, + 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, + 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, + 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214, + 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, + 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, + 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, + 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, + 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, + 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, + 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, + 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, + 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, + 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, + 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, + 15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609, + 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, + 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, + 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, + 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, + 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, + 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, + 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, + 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, + 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, + 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, + 12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, + 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, + 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, + 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, + 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, + 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, + 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, + 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, + 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, + 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, + 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, + 13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, + 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, + 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, + 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, + 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, + 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, + 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, + 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, + 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, + 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, + 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, + 13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, + 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, + 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, + 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747, + 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, + 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, + 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, + 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, + 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, + 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, + 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, + 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, + 17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, + 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, + 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, + 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, + 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, + 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, + 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, + 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, + 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747, + 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, + 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, + 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, + 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, + 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, + 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, + 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, + 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, + 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, + 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, + 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, + 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, + 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, + 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325, + 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, + 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, + 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, + 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, + 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, + 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, + 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, + 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, + 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, + 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, + 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, + 16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, + 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, + 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, + 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, + 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, + 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, + 17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, + 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, + 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, + 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, + 15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594, + 16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342 +}; +#endif + #if CONFIG_VP9_HIGHBITDEPTH static const vp9_tree_index cat1_high10[2] = {0, 0}; static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0}; @@ -117,61 +396,49 @@ static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 30, 30, 32, 32, 34, 34, 0, 0}; #endif -static void init_bit_tree(vp9_tree_index *p, int n) { - int i = 0; - - while (++i < n) { - p[0] = p[1] = i << 1; - p += 2; - } - - p[0] = p[1] = 0; -} - - const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = { - {0, 0, 0, 0}, // ZERO_TOKEN - {0, 0, 0, 1}, // ONE_TOKEN - {0, 0, 0, 2}, // TWO_TOKEN - {0, 0, 0, 3}, // THREE_TOKEN - {0, 0, 0, 4}, // FOUR_TOKEN - {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN - {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN - {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN - {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN - {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN - {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL}, // CATEGORY6_TOKEN - {0, 0, 0, 0} // EOB_TOKEN + {0, 0, 0, 0, zero_cost}, // ZERO_TOKEN + {0, 0, 0, 1, one_cost}, // ONE_TOKEN + {0, 0, 0, 2, two_cost}, // TWO_TOKEN + {0, 0, 0, 3, three_cost}, // THREE_TOKEN + {0, 0, 0, 4, four_cost}, // FOUR_TOKEN + {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost}, // CATEGORY1_TOKEN + {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost}, // CATEGORY2_TOKEN + {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost}, // CATEGORY3_TOKEN + {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost}, // CATEGORY4_TOKEN + {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost}, // CATEGORY5_TOKEN + {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0}, // CATEGORY6_TOKEN + {0, 0, 0, 0, zero_cost} // EOB_TOKEN }; #if CONFIG_VP9_HIGHBITDEPTH const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = { - {0, 0, 0, 0}, // ZERO_TOKEN - {0, 0, 0, 1}, // ONE_TOKEN - {0, 0, 0, 2}, // TWO_TOKEN - {0, 0, 0, 3}, // THREE_TOKEN - {0, 0, 0, 4}, // FOUR_TOKEN - {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN - {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN - {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN - {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN - {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN - {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL}, // CATEGORY6_TOKEN - {0, 0, 0, 0} // EOB_TOKEN + {0, 0, 0, 0, zero_cost}, // ZERO + {0, 0, 0, 1, one_cost}, // ONE + {0, 0, 0, 2, two_cost}, // TWO + {0, 0, 0, 3, three_cost}, // THREE + {0, 0, 0, 4, four_cost}, // FOUR + {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1 + {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2 + {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3 + {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4 + {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5 + {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0}, // CAT6 + {0, 0, 0, 0, zero_cost} // EOB }; const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = { - {0, 0, 0, 0}, // ZERO_TOKEN - {0, 0, 0, 1}, // ONE_TOKEN - {0, 0, 0, 2}, // TWO_TOKEN - {0, 0, 0, 3}, // THREE_TOKEN - {0, 0, 0, 4}, // FOUR_TOKEN - {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN - {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN - {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN - {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN - {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN - {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL}, // CATEGORY6_TOKEN - {0, 0, 0, 0} // EOB_TOKEN + {0, 0, 0, 0, zero_cost}, // ZERO + {0, 0, 0, 1, one_cost}, // ONE + {0, 0, 0, 2, two_cost}, // TWO + {0, 0, 0, 3, three_cost}, // THREE + {0, 0, 0, 4, four_cost}, // FOUR + {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1 + {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2 + {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3 + {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4 + {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5 + {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0}, // CAT6 + {0, 0, 0, 0, zero_cost} // EOB }; #endif @@ -180,46 +447,6 @@ const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = { {125, 7}, {126, 7}, {127, 7}, {0, 1} }; -static void tokenize_init_one(const vp9_extra_bit *const e, - int16_t *value_cost, int max_value) { - int i = -max_value; - - TOKENVALUE t; - do { - - vp9_get_token_extra(i, &t.token, &t.extra); - // initialize the cost for extra bits for all possible coefficient value. - { - int cost = 0; - const vp9_extra_bit *p = &e[t.token]; - - if (p->base_val) { - const int extra = t.extra; - const int length = p->len; - - if (length) - cost += treed_cost(p->tree, p->prob, extra >> 1, length); - - cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */ - value_cost[i] = cost; - } - } - } while (++i < max_value); -} - -void vp9_tokenize_initialize() { - tokenize_init_one(vp9_extra_bits, - dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE); -#if CONFIG_VP9_HIGHBITDEPTH - tokenize_init_one(vp9_extra_bits_high10, - dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10, - DCT_MAX_VALUE_HIGH10); - - tokenize_init_one(vp9_extra_bits_high12, - dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12, - DCT_MAX_VALUE_HIGH12); -#endif -} struct tokenize_b_args { VP9_COMP *cpi; diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 49dc4a718..81cc2e13f 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -20,8 +20,6 @@ extern "C" { #endif -void vp9_tokenize_initialize(); - #define EOSB_TOKEN 127 // Not signalled, encoder only #if CONFIG_VP9_HIGHBITDEPTH @@ -63,11 +61,29 @@ extern const int16_t *vp9_dct_value_cost_ptr; */ extern const TOKENVALUE *vp9_dct_value_tokens_ptr; extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens; +extern const int16_t vp9_cat6_low_cost[256]; +extern const int16_t vp9_cat6_high_cost[128]; +extern const int16_t vp9_cat6_high10_high_cost[512]; +extern const int16_t vp9_cat6_high12_high_cost[2048]; +static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits, + const int16_t *cat6_high_table) { + if (token != CATEGORY6_TOKEN) + return vp9_extra_bits[token].cost[extrabits]; + return vp9_cat6_low_cost[extrabits & 0xff] + + cat6_high_table[extrabits >> 8]; +} + #if CONFIG_VP9_HIGHBITDEPTH -extern const int16_t *vp9_dct_value_cost_high10_ptr; -extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr; -extern const int16_t *vp9_dct_value_cost_high12_ptr; -extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr; +static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) { + return bit_depth == 8 ? vp9_cat6_high_cost + : (bit_depth == 10 ? vp9_cat6_high10_high_cost : + vp9_cat6_high12_high_cost); +} +#else +static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) { + (void) bit_depth; + return vp9_cat6_high_cost; +} #endif // CONFIG_VP9_HIGHBITDEPTH static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) { diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index f5e6e3190..06096a6b1 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h VP9_COMMON_SRCS-yes += common/vp9_enums.h VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h +VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h VP9_COMMON_SRCS-yes += common/vp9_mv.h VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h VP9_COMMON_SRCS-yes += common/vp9_pred_common.h @@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c +VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h VP9_COMMON_SRCS-yes += common/vp9_quant_common.c @@ -133,11 +135,14 @@ ifeq ($(ARCH_X86_64), yes) VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c # neon with assembly and intrinsics implementations. If both are available # prefer assembly. @@ -156,9 +161,7 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM) else ifeq ($(HAVE_NEON), yes) @@ -176,8 +179,11 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c +# TODO(johannkoenig): re-enable when chromium build is fixed +# # https://code.google.com/p/chromium/issues/detail?id=443839 +#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 7b4b17809..46e6e919c 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -42,6 +42,7 @@ struct vp9_extracfg { unsigned int frame_periodic_boost; vpx_bit_depth_t bit_depth; vp9e_tune_content content; + vpx_color_space_t color_space; }; static struct vp9_extracfg default_extra_cfg = { @@ -64,7 +65,8 @@ static struct vp9_extracfg default_extra_cfg = { NO_AQ, // aq_mode 0, // frame_periodic_delta_q VPX_BITS_8, // Bit depth - VP9E_CONTENT_DEFAULT // content + VP9E_CONTENT_DEFAULT, // content + VPX_CS_UNKNOWN, // color space }; struct vpx_codec_alg_priv { @@ -294,7 +296,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, cfg->g_bit_depth == VPX_BITS_8) { ERROR("Codec bit-depth 8 not supported in profile > 1"); } - + RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB); return VPX_CODEC_OK; } @@ -351,9 +353,9 @@ static int get_image_bps(const vpx_image_t *img) { } static vpx_codec_err_t set_encoder_config( - VP9EncoderConfig *oxcf, - const vpx_codec_enc_cfg_t *cfg, - const struct vp9_extracfg *extra_cfg) { + VP9EncoderConfig *oxcf, + const vpx_codec_enc_cfg_t *cfg, + const struct vp9_extracfg *extra_cfg) { const int is_vbr = cfg->rc_end_usage == VPX_VBR; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; @@ -437,6 +439,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; #endif + oxcf->color_space = extra_cfg->color_space; oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; oxcf->arnr_strength = extra_cfg->arnr_strength; @@ -1322,6 +1325,13 @@ static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, {VP8E_UPD_ENTROPY, ctrl_update_entropy}, @@ -1357,6 +1367,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP9E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback}, {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, + {VP9E_SET_COLOR_SPACE, ctrl_set_color_space}, {VP9E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity}, // Getters diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 43bf35f9c..c0e429736 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -95,12 +95,11 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { static int parse_bitdepth_colorspace_sampling( BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) { - const int sRGB = 7; - int colorspace; + vpx_color_space_t color_space; if (profile >= PROFILE_2) rb->bit_offset += 1; // Bit-depth 10 or 12. - colorspace = vp9_rb_read_literal(rb, 3); - if (colorspace != sRGB) { + color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3); + if (color_space != VPX_CS_SRGB) { rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. if (profile == PROFILE_1 || profile == PROFILE_3) { rb->bit_offset += 2; // subsampling x/y. @@ -148,7 +147,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data, if (frame_marker != VP9_FRAME_MARKER) return VPX_CODEC_UNSUP_BITSTREAM; - if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM; + if (profile >= MAX_PROFILES) + return VPX_CODEC_UNSUP_BITSTREAM; + + if ((profile >= 2 && data_sz <= 1) || data_sz < 1) + return VPX_CODEC_UNSUP_BITSTREAM; if (vp9_rb_read_bit(&rb)) { // show an existing frame vp9_rb_read_literal(&rb, 3); // Frame buffer to show. diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h index 00fbfdd7d..e585aa147 100644 --- a/vp9/vp9_iface_common.h +++ b/vp9/vp9_iface_common.h @@ -34,6 +34,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, bps = 12; } } + img->cs = yv12->color_space; img->bit_depth = 8; img->w = yv12->y_stride; img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); @@ -92,6 +93,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->y_stride = img->stride[VPX_PLANE_Y]; yv12->uv_stride = img->stride[VPX_PLANE_U]; + yv12->color_space = img->cs; #if CONFIG_VP9_HIGHBITDEPTH if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index c75fd8a01..33a1e6735 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -150,6 +150,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 1fcb36f66..603158a9c 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -21,8 +21,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_reader.h VP9_DX_SRCS-yes += decoder/vp9_reader.c VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 4fc0fd62f..a920ee3f9 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -10,7 +10,7 @@ #ifndef VPX_VP8CX_H_ #define VPX_VP8CX_H_ -/*!\defgroup vp8_encoder WebM VP8 Encoder +/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder * \ingroup vp8 * * @{ @@ -18,7 +18,7 @@ #include "./vp8.h" /*!\file - * \brief Provides definitions for using the VP8 encoder algorithm within the + * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the * vpx Codec Interface. */ @@ -28,17 +28,20 @@ extern "C" { /*!\name Algorithm interface for VP8 * - * This interface provides the capability to encode raw VP8 streams, as would - * be found in AVI files. + * This interface provides the capability to encode raw VP8 streams. * @{ */ extern vpx_codec_iface_t vpx_codec_vp8_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); +/*!@} - end algorithm interface member group*/ -/* TODO(jkoleszar): These move to VP9 in a later patch set. */ +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to encode raw VP9 streams. + * @{ + */ extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); - /*!@} - end algorithm interface member group*/ @@ -234,20 +237,111 @@ enum vp8e_enc_control_id { VP8E_SET_SCREEN_CONTENT_MODE, /**<control function to set encoder screen content mode */ - /* TODO(jkoleszar): Move to vp9cx.h */ + /*!\brief Codec control function to set lossless encoding mode + * + * VP9 can operate in lossless encoding mode, in which the bitstream + * produced will be able to decode and reconstruct a perfect copy of + * input source. This control function provides a mean to switch encoder + * into lossless coding mode(1) or normal coding mode(0) that may be lossy. + * 0 = lossy coding mode + * 1 = lossless coding mode + * + * By default, encoder operates in normal coding mode (maybe lossy). + */ VP9E_SET_LOSSLESS, + + /*!\brief Codec control function to set number of tile columns + * + * In encoding and decoding, VP9 allows an input image frame be partitioned + * into separated vertical tile columns, which can be encoded or decoded + * independently. This enables easy implementation of parallel encoding and + * decoding. This control requests the encoder to use column tiles in + * encoding an input frame, with number of tile columns (in Log2 unit) as + * the parameter: + * 0 = 1 tile column + * 1 = 2 tile columns + * 2 = 4 tile columns + * ..... + * n = 2**n tile columns + * The requested tile columns will be capped by encoder based on image size + * limitation (The minimum width of a tile column is 256 pixel, the maximum + * is 4096). + * + * By default, the value is 0, i.e. one single column tile for entire image. + */ VP9E_SET_TILE_COLUMNS, + + /*!\brief Codec control function to set number of tile rows + * + * In encoding and decoding, VP9 allows an input image frame be partitioned + * into separated horizontal tile rows. Tile rows are encoded or decoded + * sequentially. Even though encoding/decoding of later tile rows depends on + * earlier ones, this allows the encoder to output data packets for tile rows + * prior to completely processing all tile rows in a frame, thereby reducing + * the latency in processing between input and output. The parameter + * for this control describes the number of tile rows, which has a valid + * range [0, 2]: + * 0 = 1 tile row + * 1 = 2 tile rows + * 2 = 4 tile rows + * + * By default, the value is 0, i.e. one single row tile for entire image. + */ VP9E_SET_TILE_ROWS, + + /*!\brief Codec control function to enable frame parallel decoding feature + * + * VP9 has a bitstream feature to reduce decoding dependency between frames + * by turning off backward update of probability context used in encoding + * and decoding. This allows staged parallel processing of more than one + * video frames in the decoder. This control function provides a mean to + * turn this feature on or off for bitstreams produced by encoder. + * + * By default, this feature is off. + */ VP9E_SET_FRAME_PARALLEL_DECODING, + + /*!\brief Codec control function to set adaptive quantization mode + * + * VP9 has a segment based feature that allows encoder to adaptively change + * quantization parameter for each segment within a frame to improve the + * subjective quality. This control makes encoder operate in one of the + * several AQ_modes supported. + * + * By default, encoder operates with AQ_Mode 0(adaptive quantization off). + */ VP9E_SET_AQ_MODE, + + /*!\brief Codec control function to enable/disable periodic Q boost + * + * One VP9 encoder speed feature is to enable quality boost by lowering + * frame level Q periodically. This control function provides a mean to + * turn on/off this feature. + * 0 = off + * 1 = on + * + * By default, the encoder is allowed to use this feature for appropriate + * encoding modes. + */ VP9E_SET_FRAME_PERIODIC_BOOST, + /*!\brief control function to set noise sensitivity * * 0: off, 1: OnYOnly */ VP9E_SET_NOISE_SENSITIVITY, + /*!\brief control function to turn on/off SVC in encoder. + * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not + * support SVC in its current encoding mode + * 0: off, 1: on + */ VP9E_SET_SVC, + + /*!\brief control function to set parameters for SVC. + * \note Parameters contain min_q, max_q, scaling factor for each of the + * SVC layers. + */ VP9E_SET_SVC_PARAMETERS, /*!\brief control function to set svc layer for spatial and temporal. @@ -256,9 +350,38 @@ enum vp8e_enc_control_id { * temporal layer. */ VP9E_SET_SVC_LAYER_ID, + + /*!\brief control function to set content type. + * \note Valid parameter range: + * VP9E_CONTENT_DEFAULT = Regular video content (Default) + * VP9E_CONTENT_SCREEN = Screen capture content + */ VP9E_SET_TUNE_CONTENT, + + /*!\brief control function to get svc layer ID. + * \note The layer ID returned is for the data packet from the registered + * callback function. + */ VP9E_GET_SVC_LAYER_ID, + + /*!\brief control function to register callback for getting per layer packet. + * \note Parameter for this control function is a structure with a callback + * function and a pointer to private data used by the callback. + */ VP9E_REGISTER_CX_CALLBACK, + + /*!\brief control function to set color space info. + * \note Valid ranges: 0..7, default is "UNKNOWN". + * 0 = UNKNOWN, + * 1 = BT_601 + * 2 = BT_709 + * 3 = SMPTE_170 + * 4 = SMPTE_240 + * 5 = BT_2020 + * 6 = RESERVED + * 7 = SRGB + */ + VP9E_SET_COLOR_SPACE, }; /*!\brief vpx 1-D scaling mode @@ -423,6 +546,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ + +VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 5cc25cd6a..3ba72fe3c 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -9,13 +9,13 @@ */ -/*!\defgroup vp8_decoder WebM VP8 Decoder +/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder * \ingroup vp8 * * @{ */ /*!\file - * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder + * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder * interface. */ #ifndef VPX_VP8DX_H_ @@ -30,14 +30,18 @@ extern "C" { /*!\name Algorithm interface for VP8 * - * This interface provides the capability to decode raw VP8 streams, as would - * be found in AVI files and other non-Flash uses. + * This interface provides the capability to decode VP8 streams. * @{ */ extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); +/*!@} - end algorithm interface member group*/ -/* TODO(jkoleszar): These move to VP9 in a later patch set. */ +/*!\name Algorithm interface for VP9 + * + * This interface provides the capability to decode VP9 streams. + * @{ + */ extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /*!@} - end algorithm interface member group*/ @@ -85,7 +89,14 @@ enum vp8_dec_control_id { */ VP9_SET_BYTE_ALIGNMENT, - /** For testing. */ + /** control function to invert the decoding order to from right to left. The + * function is used in a test to confirm the decoding independence of tile + * columns. The function may be used in application where this order + * of decoding is desired. + * + * TODO(yaowu): Rework the unit test that uses this control, and in a future + * release, this test-only control shall be removed. + */ VP9_INVERT_TILE_DECODE_ORDER, VP8_DECODER_CTRL_ID_MAX diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index da5bd0659..8f7bff518 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -232,8 +232,8 @@ extern "C" { /*!\brief Callback function pointer / user data pair storage */ typedef struct vpx_codec_enc_output_cx_cb_pair { - vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; - void *user_priv; + vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */ + void *user_priv; /**< Pointer to private data */ } vpx_codec_priv_output_cx_pkt_cb_pair_t; /*!\brief Rational Number @@ -737,10 +737,10 @@ extern "C" { * */ typedef struct vpx_svc_parameters { - int max_quantizers[VPX_SS_MAX_LAYERS]; - int min_quantizers[VPX_SS_MAX_LAYERS]; - int scaling_factor_num[VPX_SS_MAX_LAYERS]; - int scaling_factor_den[VPX_SS_MAX_LAYERS]; + int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */ + int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */ + int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/ + int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/ } vpx_svc_extra_cfg_t; diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h index 337e4c4be..c06d35101 100644 --- a/vpx/vpx_image.h +++ b/vpx/vpx_image.h @@ -28,7 +28,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/ +#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/ #define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ @@ -66,9 +66,22 @@ extern "C" { VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ + /*!\brief List of supported color spaces */ + typedef enum vpx_color_space { + VPX_CS_UNKNOWN = 0, /**< Unknown */ + VPX_CS_BT_601 = 1, /**< BT.601 */ + VPX_CS_BT_709 = 2, /**< BT.709 */ + VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */ + VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */ + VPX_CS_BT_2020 = 5, /**< BT.2020 */ + VPX_CS_RESERVED = 6, /**< Reserved */ + VPX_CS_SRGB = 7 /**< sRGB */ + } vpx_color_space_t; /**< alias for enum vpx_color_space */ + /**\brief Image Descriptor */ typedef struct vpx_image { vpx_img_fmt_t fmt; /**< Image Format */ + vpx_color_space_t cs; /**< Color Space */ /* Image storage dimensions */ unsigned int w; /**< Stored image width */ diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h index f04dee1e8..6cdc235fe 100644 --- a/vpx_scale/yv12config.h +++ b/vpx_scale/yv12config.h @@ -55,6 +55,7 @@ typedef struct yv12_buffer_config { int subsampling_x; int subsampling_y; unsigned int bit_depth; + vpx_color_space_t color_space; int corrupted; int flags; @@ -442,7 +442,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_MAX_INTER_BITRATE_PCT, VP8E_SET_GF_CBR_BOOST_PCT, VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY, - VP9E_SET_TUNE_CONTENT, + VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE, 0 }; #endif |