64 files changed, 2070 insertions, 604 deletions
@@ -76,7 +76,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc - ppc64-linux-gcc ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc diff --git a/build/make/configure.sh b/build/make/configure.sh index c4e3b5141..480b2d0ea 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -719,11 +719,8 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; - power*64*-*) - tgt_isa=ppc64 - ;; - power*) - tgt_isa=ppc + power*64le*-*) + tgt_isa=ppc64le ;; *mips64el*) tgt_isa=mips64 @@ -835,7 +832,7 @@ process_common_toolchain() { IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi # Handle darwin variants. Newer SDKs allow targeting older @@ -1221,7 +1218,7 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) + ppc64le*) link_with_cc=gcc setup_gnu_toolchain check_gcc_machine_option "vsx" diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh index 365a8c013..2442a282d 100755 --- a/build/make/iosbuild.sh +++ b/build/make/iosbuild.sh @@ -132,7 +132,8 @@ create_vpx_framework_config_shim() { done # Consume the last line of output from the loop: We don't want it. - sed -i '' -e '$d' "${config_file}" + sed -i.bak -e '$d' "${config_file}" + rm "${config_file}.bak" printf "#endif\n\n" >> "${config_file}" printf "#endif // ${include_guard}" >> "${config_file}" @@ -350,7 +351,7 @@ if [ "$ENABLE_SHARED" = "yes" ]; then IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi if [ "${VERBOSE}" = "yes" ]; then @@ -116,7 +116,6 @@ all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" @@ -328,6 +327,7 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -389,6 +389,7 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth @@ -521,7 +522,7 @@ process_detect() { # here rather than at option parse time because the target auto-detect # magic happens after the command line has been parsed. case "${tgt_os}" in - linux|os2|darwin*|iphonesimulator*) + linux|os2|solaris|darwin*|iphonesimulator*) # Supported platforms ;; *) @@ -593,6 +594,10 @@ EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -699,7 +704,7 @@ process_toolchain() { soft_enable libyuv ;; *-android-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; @@ -708,7 +713,7 @@ process_toolchain() { # x86 targets. 
;; *-iphonesimulator-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv ;; *-win*) @@ -718,9 +723,7 @@ process_toolchain() { check_cxx "$@" <<EOF && soft_enable unit_tests int z; EOF - check_cxx "$@" <<EOF && soft_enable webm_io -int z; -EOF + check_add_cxxflags -std=c++11 && soft_enable webm_io check_cxx "$@" <<EOF && soft_enable libyuv int z; EOF @@ -729,9 +732,7 @@ EOF enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests int z; EOF - check_cxx "$@" <<EOF && soft_enable webm_io -int z; -EOF + check_add_cxxflags -std=c++11 && soft_enable webm_io check_cxx "$@" <<EOF && soft_enable libyuv int z; EOF diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 091c6954d..3fd961bdc 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -622,6 +622,7 @@ int main(int argc, const char **argv) { vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; + vpx_svc_frame_drop_t svc_drop_frame; uint32_t i; uint32_t frame_cnt = 0; vpx_image_t raw; @@ -732,6 +733,12 @@ int main(int argc, const char **argv) { vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; + svc_drop_frame.max_consec_drop = INT_MAX; + vpx_codec_control(&codec, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + // Encode frames while (!end_of_stream) { vpx_codec_iter_t iter = NULL; @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } @@ -282,18 +282,6 @@ $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) -libvpx.ver: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)echo "{ global:" > $@ - $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done - $(qexec)echo "local: *; };" >> $@ -CLEAN-OBJS += libvpx.ver - -libvpx.syms: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)awk '{print "_"$$2}' $^ >$@ -CLEAN-OBJS += libvpx.syms - libvpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ @@ -353,6 +341,18 @@ INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc endif +libvpx.ver: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo "{ global:" > $@ + $(qexec)for f in $?; 
do awk '{print $$2";"}' < $$f >>$@; done + $(qexec)echo "local: *; };" >> $@ +CLEAN-OBJS += libvpx.ver + +libvpx.syms: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)awk '{print "_"$$2}' $^ >$@ +CLEAN-OBJS += libvpx.syms + # # Rule to make assembler configuration file from C configuration file # diff --git a/test/bench.cc b/test/bench.cc new file mode 100644 index 000000000..281b7411d --- /dev/null +++ b/test/bench.cc @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdio.h> +#include <algorithm> + +#include "test/bench.h" +#include "vpx_ports/vpx_timer.h" + +void AbstractBench::runNTimes(int n) { + for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < n; ++j) { + run(); + } + vpx_usec_timer_mark(&timer); + times[r] = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + } +} + +void AbstractBench::printMedian(const char *title) { + std::sort(times, times + VPX_BENCH_ROBUST_ITER); + const int med = times[VPX_BENCH_ROBUST_ITER >> 1]; + int sad = 0; + for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) { + sad += abs(times[t] - med); + } + printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0, + sad / (VPX_BENCH_ROBUST_ITER * 1000.0)); +} diff --git a/test/bench.h b/test/bench.h new file mode 100644 index 000000000..0b0cf10a4 --- /dev/null +++ b/test/bench.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_BENCH_H_ +#define TEST_BENCH_H_ + +// Number of iterations used to compute median run time. +#define VPX_BENCH_ROBUST_ITER 15 + +class AbstractBench { + public: + void runNTimes(int n); + void printMedian(const char *title); + + protected: + // Implement this method and put the code to benchmark in it. 
+ virtual void run() = 0; + + private: + int times[VPX_BENCH_ROBUST_ITER]; +}; + +#endif // TEST_BENCH_H_ diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index 404b5b44f..34e35b065 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -152,5 +152,5 @@ VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest, ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(0, 9)); + ::testing::Range(0, 10)); } // namespace diff --git a/test/dct_test.cc b/test/dct_test.cc index 10062150f..e8ad0cd5d 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -725,4 +725,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_vsx>, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 0bb435502..142d9e2da 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -48,7 +48,7 @@ const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), }; -const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 }; +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 }; const int kEncodePerfTestThreads[] = { 1, 2, 4 }; #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 63e972a00..b2cbc3f05 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -201,7 +201,7 @@ void EncoderTest::RunLoop(VideoSource *video) { PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); - PostEncodeFrameHook(); + PostEncodeFrameHook(encoder.get()); CxDataIterator iter = encoder->GetCxData(); diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index a301e21cc..03624d110 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -226,7 +226,7 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} - virtual void PostEncodeFrameHook() {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} // Hook to be called on every compressed data packet. 
virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 43a4c6929..9cfaa1f1f 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -124,6 +124,7 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } const DecodeParam kVP8InvalidFileTests[] = { { 1, "invalid-bug-1443.ivf" }, { 1, "invalid-token-partition.ivf" }, + { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 5862d2649..4b5d55469 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -16,7 +16,7 @@ #include "test/video_source.h" namespace libvpx_test { -const unsigned int kCodeBufferSize = 256 * 1024; +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; const unsigned int kIvfFileHdrSize = 32; const unsigned int kIvfFrameHdrSize = 12; diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 5a2ade1ef..1fe0348fc 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -32,7 +33,6 @@ typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, int cols, int flimit); namespace { - // Compute the filter level used in post proc from the loop filter strength int q2mbl(int x) { if (x < 20) x = 20; @@ -42,18 +42,36 @@ int q2mbl(int x) { } class VpxPostProcDownAndAcrossMbRowTest - : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> { + : public AbstractBench, + public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> { public: + VpxPostProcDownAndAcrossMbRowTest() : mbPostProcDownAndAcross(GetParam()) {} virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + const VpxPostProcDownAndAcrossMbRowFunc mbPostProcDownAndAcross; + // Size of the underlying data block that will be filtered. + int block_width; + int block_height; + Buffer<uint8_t> *src_image; + Buffer<uint8_t> *dst_image; + uint8_t *flimits; + void run(); }; +void VpxPostProcDownAndAcrossMbRowTest::run() { + mbPostProcDownAndAcross(src_image->TopLeftPixel(), dst_image->TopLeftPixel(), + src_image->stride(), dst_image->stride(), block_width, + flimits, 16); +} + // Test routine for the VPx post-processing function // vpx_post_proc_down_and_across_mb_row_c. TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Size of the underlying data block that will be filtered. - const int block_width = 16; - const int block_height = 16; + block_width = 16; + block_height = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2); @@ -66,8 +84,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8); ASSERT_TRUE(dst_image.Init()); - uint8_t *const flimits = - reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); (void)memset(flimits, 255, block_width); // Initialize pixels in the input: @@ -79,13 +96,12 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Initialize pixels in the output to 99. 
dst_image.Set(99); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross( src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), dst_image.stride(), block_width, flimits, 16)); - static const uint8_t kExpectedOutput[block_height] = { - 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4 - }; + static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 3, 4 }; uint8_t *pixel_ptr = dst_image.TopLeftPixel(); for (int i = 0; i < block_height; ++i) { @@ -103,8 +119,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // Size of the underlying data block that will be filtered. // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V // blocks are always a multiple of 8 wide and exactly 8 high. - const int block_width = 136; - const int block_height = 16; + block_width = 136; + block_height = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. @@ -127,8 +143,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so // it must be padded out. const int flimits_width = block_width % 16 ? block_width + 8 : block_width; - uint8_t *const flimits = - reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width)); + flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width)); ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); @@ -143,7 +158,6 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { for (int f = 0; f < 255; f++) { (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); - dst_image.Set(0); dst_image_ref.Set(0); @@ -151,10 +165,10 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), src_image.stride(), dst_image_ref.stride(), block_width, flimits, block_height); - ASM_REGISTER_STATE_CHECK( - GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(), - src_image.stride(), dst_image.stride(), block_width, - flimits, block_height)); + ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width, flimits, + block_height)); ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); } @@ -163,12 +177,58 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { vpx_free(flimits); } +TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { + // Size of the underlying data block that will be filtered. + block_width = 16; + block_height = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2); + ASSERT_TRUE(src_image.Init()); + this->src_image = &src_image; + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer<uint8_t> dst_image = + Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); + this->dst_image = &dst_image; + + flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + (void)memset(flimits, 255, block_width); + + // Initialize pixels in the input: + // block pixels to value 1, + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(1); + + // Initialize pixels in the output to 99. 
+ dst_image.Set(99); + + runNTimes(INT16_MAX); + printMedian("16x16"); + + vpx_free(flimits); +}; + class VpxMbPostProcAcrossIpTest - : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> { + : public AbstractBench, + public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> { public: + VpxMbPostProcAcrossIpTest() + : rows(16), cols(16), mbPostProcAcrossIp(GetParam()), + src(Buffer<uint8_t>(rows, cols, 8, 8, 17, 8)) {} virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: + const int rows; + const int cols; + const VpxMbPostProcAcrossIpFunc mbPostProcAcrossIp; + Buffer<uint8_t> src; + void run(); + void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { @@ -197,11 +257,11 @@ class VpxMbPostProcAcrossIpTest } }; -TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; +void VpxMbPostProcAcrossIpTest::run() { + mbPostProcAcrossIp(src.TopLeftPixel(), src.stride(), rows, cols, q2mbl(0)); +} - Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); +TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); @@ -215,15 +275,11 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { } TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 }; @@ -232,15 +288,11 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { } TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 }; @@ -254,9 +306,6 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { } TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); ASSERT_TRUE(c_mem.Init()); Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); @@ -279,12 +328,33 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { } } +TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) { + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + runNTimes(100000); + printMedian("16x16"); +} + class VpxMbPostProcDownTest - : public ::testing::TestWithParam<VpxMbPostProcDownFunc> { + : public AbstractBench, + public ::testing::TestWithParam<VpxMbPostProcDownFunc> { public: + VpxMbPostProcDownTest() + : rows(16), cols(16), mbPostProcDown(GetParam()), + src_c(Buffer<uint8_t>(rows, cols, 8, 8, 8, 17)) {} + virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: + const int rows; + const int cols; + const VpxMbPostProcDownFunc mbPostProcDown; + Buffer<uint8_t> src_c; + void run(); + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < 
rows; r++) { memset(src_c, r, cols); @@ -306,22 +376,22 @@ class VpxMbPostProcDownTest void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, int filter_level, const unsigned char *expected_output) { ASM_REGISTER_STATE_CHECK( - GetParam()(s, src_width, rows, cols, filter_level)); + mbPostProcDown(s, src_width, rows, cols, filter_level)); RunComparison(expected_output, s, rows, cols, src_width); } }; -TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; +void VpxMbPostProcDownTest::run() { + mbPostProcDown(src_c.TopLeftPixel(), src_c.stride(), rows, cols, q2mbl(0)); +} - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); +TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, @@ -348,16 +418,12 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { } TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -379,10 +445,6 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { } TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); @@ -399,13 +461,9 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_c.Init()); Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_asm.Init()); @@ -418,7 +476,7 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mbPostProcDown( src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); ASSERT_TRUE(src_asm.CheckValues(src_c)); @@ -429,12 +487,22 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mbPostProcDown( src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); ASSERT_TRUE(src_asm.CheckValues(src_c)); } } +TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + runNTimes(100000); + printMedian("16x16"); +} + INSTANTIATE_TEST_CASE_P( C, VpxPostProcDownAndAcrossMbRowTest, 
::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); @@ -481,4 +549,16 @@ INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, ::testing::Values(vpx_mbpost_proc_down_msa)); #endif // HAVE_MSA +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); +#endif // HAVE_VSX + } // namespace diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index be3a1969c..b9fbd8f4f 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -115,6 +115,8 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { key_frame_spacing_ = 9999; num_nonref_frames_ = 0; layer_framedrop_ = 0; + force_key_ = 0; + force_key_test_ = 0; } virtual void BeginPassHook(unsigned int /*pass*/) {} @@ -203,6 +205,7 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { svc_drop_frame.framedrop_mode = LAYER_DROP; for (i = 0; i < number_spatial_layers_; i++) svc_drop_frame.framedrop_thresh[i] = 30; + svc_drop_frame.max_consec_drop = 30; encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); } } @@ -268,7 +271,7 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { } if (dynamic_drop_layer_) { - if (video->frame() == 50) { + if (video->frame() == 0) { // Change layer bitrates to set top layers to 0. This will trigger skip // encoding/dropping of top two spatial layers. cfg_.rc_target_bitrate -= @@ -278,7 +281,25 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { cfg_.layer_target_bitrate[1] = 0; cfg_.layer_target_bitrate[2] = 0; encoder->Config(&cfg_); + } else if (video->frame() == 50) { + // Change layer bitrates to non-zero on two top spatial layers. + // This will trigger skip encoding of top two spatial layers. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += + cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); } else if (video->frame() == 100) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 150) { // Change layer bitrate on second layer to non-zero to start // encoding it again. 
cfg_.layer_target_bitrate[1] = middle_bitrate_; @@ -292,12 +313,21 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { encoder->Config(&cfg_); } } + + if (force_key_test_ && force_key_) + frame_flags_ = VPX_EFLAG_FORCE_KF; + else + frame_flags_ = 0; + const vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; } - virtual void PostEncodeFrameHook() { + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; for (int sl = 0; sl < number_spatial_layers_; ++sl) { for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { const int layer = sl * number_temporal_layers_ + tl; @@ -366,13 +396,19 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { // In the constrained frame drop mode, if a given spatial is dropped all // upper layers must be dropped too. if (!layer_framedrop_) { + int num_layers_dropped = 0; for (int sl = 0; sl < number_spatial_layers_; ++sl) { if (!pkt->data.frame.spatial_layer_encoded[sl]) { // Check that all upper layers are dropped. + num_layers_dropped++; for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2) ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0); } } + if (num_layers_dropped == number_spatial_layers_ - 1) + force_key_ = 1; + else + force_key_ = 0; } // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, @@ -461,6 +497,8 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { int key_frame_spacing_; unsigned int num_nonref_frames_; int layer_framedrop_; + int force_key_; + int force_key_test_; }; // Params: speed setting. 
@@ -528,6 +566,53 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) { } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers, with force key frame after frame drop +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 3; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 100; + ResetModel(); + AssignLayerBitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(num_nonref_frames_, GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and // 3 temporal layers. Run CIF clip with 1 thread. TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL) { cfg_.rc_buf_initial_sz = 500; @@ -711,9 +796,9 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) { cfg_.ts_number_layers, cfg_.temporal_layering_mode, layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - // Don't check rate targeting on top spatial layer since it will be skipped - // for part of the sequence. - CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 1, + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 2, number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // The non-reference frames are expected to be mismatched frames as the @@ -848,7 +933,7 @@ TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) { layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.75, 1.2); + number_temporal_layers_, file_datarate_, 0.75, 1.45); #if CONFIG_VP9_DECODER // The non-reference frames are expected to be mismatched frames as the // encoder will avoid loopfilter on these frames. 
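/*
 * Aside (not part of the patch): a minimal sketch of how an application wires
 * up the frame-drop control exercised by the SVC tests above, mirroring the
 * vp9_spatial_svc_encoder.c hunk earlier in this diff. The codec context,
 * enc_cfg and spatial_layers variables are assumed to be initialized already.
 */
vpx_svc_frame_drop_t svc_drop_frame;
svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP;  /* or LAYER_DROP */
for (int sl = 0; sl < spatial_layers; ++sl)
  svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh;
svc_drop_frame.max_consec_drop = INT_MAX;  /* no cap on consecutive drops */
vpx_codec_control(&codec, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame);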
@@ -1147,20 +1232,21 @@ TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) { } VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSingleBR, - ::testing::Range(5, 9)); + ::testing::Range(5, 10)); -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 9), +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 10), ::testing::Range(0, 3)); VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcFrameDropMultiBR, - ::testing::Range(5, 9), ::testing::Range(0, 2), + ::testing::Range(5, 10), ::testing::Range(0, 2), ::testing::Range(0, 3)); #if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser, ::testing::Range(5, 9), - ::testing::Range(1, 3), ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser, + ::testing::Range(5, 10), ::testing::Range(1, 3), + ::testing::Range(0, 3)); #endif -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 9), +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 10), ::testing::Range(32, 36)); } // namespace diff --git a/test/test-data.mk b/test/test-data.mk index 7ca11bc9c..4be6c66ff 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -738,6 +738,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 3a23ff5db..9cb9d5864 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -856,3 +856,5 @@ fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res 90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 +a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res diff --git a/test/test.mk b/test/test.mk index 3e5739e21..224ac4e8f 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,4 +1,6 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += bench.h +LIBVPX_TEST_SRCS-yes += bench.cc LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h diff --git a/test/variance_test.cc b/test/variance_test.cc index 725821ae6..fce7a1475 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1533,6 +1533,27 @@ INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_CASE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_CASE_P( + VSX, 
VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), + VarianceParams(6, 5, &vpx_variance64x32_vsx), + VarianceParams(5, 6, &vpx_variance32x64_vsx), + VarianceParams(5, 5, &vpx_variance32x32_vsx), + VarianceParams(5, 4, &vpx_variance32x16_vsx), + VarianceParams(4, 5, &vpx_variance16x32_vsx), + VarianceParams(4, 4, &vpx_variance16x16_vsx), + VarianceParams(4, 3, &vpx_variance16x8_vsx), + VarianceParams(3, 4, &vpx_variance8x16_vsx), + VarianceParams(3, 3, &vpx_variance8x8_vsx), + VarianceParams(3, 2, &vpx_variance8x4_vsx), + VarianceParams(2, 3, &vpx_variance4x8_vsx), + VarianceParams(2, 2, &vpx_variance4x4_vsx))); #endif // HAVE_VSX #if HAVE_MMI diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index c4dbcacbe..a8bcc2a43 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -266,7 +266,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36) << " The datarate for the file is greater than target by too much!"; } @@ -294,7 +294,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) << " The datarate for the file is greater than target by too much!"; } @@ -824,16 +824,17 @@ TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(2, 9), ::testing::Range(0, 4)); + ::testing::Range(2, 10), ::testing::Range(0, 4)); VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeOneBR, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(2, 9)); + ::testing::Range(2, 10)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); #if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, + ::testing::Range(5, 10)); #endif } // namespace diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 6b7e51211..44659904f 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -409,7 +409,7 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(3, 9), // cpu_used + ::testing::Range(3, 10), // cpu_used ::testing::Range(0, 3), // tile_columns ::testing::Range(2, 5))); // threads diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index f0bbedbfa..c39267faa 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -18,6 +18,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -67,10 +68,13 @@ void QuantFPWrapper(const tran_low_t *coeff, 
intptr_t count, int skip_block, scan, iscan); } -class VP9QuantizeBase { +class VP9QuantizeBase : public AbstractBench { public: VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) - : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) { + : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp), + coeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 16)), + qcoeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)), + dqcoeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)) { max_value_ = (1 << bit_depth_) - 1; zbin_ptr_ = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); @@ -86,6 +90,9 @@ class VP9QuantizeBase { vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); + + r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; } ~VP9QuantizeBase() { @@ -118,6 +125,15 @@ class VP9QuantizeBase { int max_value_; const int max_size_; const bool is_fp_; + Buffer<tran_low_t> coeff; + Buffer<tran_low_t> qcoeff; + Buffer<tran_low_t> dqcoeff; + int16_t *r_ptr; + int16_t *q_ptr; + int count; + int skip_block; + const scan_order *scan; + uint16_t eob; }; class VP9QuantizeTest : public VP9QuantizeBase, @@ -128,10 +144,17 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: + void run(); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; +void VP9QuantizeTest::run() { + quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan->scan, scan->iscan); +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). @@ -269,11 +292,8 @@ void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(qcoeff.Init()); - Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(dqcoeff.Init()); Buffer<tran_low_t> ref_qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); @@ -281,7 +301,8 @@ TEST_P(VP9QuantizeTest, OperationCheck) { Buffer<tran_low_t> ref_dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_dqcoeff.Init()); - uint16_t eob, ref_eob; + uint16_t ref_eob = 0; + eob = 0; for (int i = 0; i < number_of_iterations; ++i) { // Test skip block for the first three iterations to catch all the different @@ -294,23 +315,21 @@ TEST_P(VP9QuantizeTest, OperationCheck) { sz = TX_32X32; } const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); + scan = &vp9_scan_orders[sz][tx_type]; + count = (4 << sz) * (4 << sz); coeff.Set(&rnd, -max_value_, max_value_); GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_order->scan, scan_order->iscan); + scan->scan, scan->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), - dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + dequant_ptr_, &eob, scan->scan, scan->iscan)); EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); @@ -328,11 +347,8 @@ TEST_P(VP9QuantizeTest, OperationCheck) { TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(qcoeff.Init()); - Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(dqcoeff.Init()); Buffer<tran_low_t> ref_qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); @@ -340,10 +356,12 @@ TEST_P(VP9QuantizeTest, EOBCheck) { Buffer<tran_low_t> ref_dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_dqcoeff.Init()); - uint16_t eob, ref_eob; + uint16_t ref_eob = 0; + eob = 0; + const uint32_t max_index = max_size_ * max_size_ - 1; for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = 0; + skip_block = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -351,28 +369,26 @@ TEST_P(VP9QuantizeTest, EOBCheck) { sz = TX_32X32; } const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); + scan = &vp9_scan_orders[sz][tx_type]; + count = (4 << sz) * (4 << sz); // Two random entries coeff.Set(0); - coeff.TopLeftPixel()[rnd(count)] = + coeff.TopLeftPixel()[rnd.RandRange(count) & max_index] = static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; - coeff.TopLeftPixel()[rnd(count)] = + coeff.TopLeftPixel()[rnd.RandRange(count) & max_index] = static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_order->scan, scan_order->iscan); + scan->scan, scan->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), - dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + dequant_ptr_, &eob, scan->scan, scan->iscan)); EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); @@ -390,13 +406,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(qcoeff.Init()); - Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(dqcoeff.Init()); - uint16_t eob; TX_SIZE starting_sz, ending_sz; if (max_size_ == 16) { @@ -410,18 +422,16 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { // zbin > coeff, zbin < coeff. for (int i = 0; i < 2; ++i) { - const int skip_block = 0; + skip_block = 0; // TX_TYPE defines the scan order. That is not relevant to the speed test. // Pick the first one. const TX_TYPE tx_type = DCT_DCT; - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); + count = (4 << sz) * (4 << sz); + scan = &vp9_scan_orders[sz][tx_type]; GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; if (i == 0) { // When |coeff values| are less than zbin the results are 0. @@ -438,22 +448,15 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { coeff.Set(&rnd, -500, 500); } - vpx_usec_timer timer; - vpx_usec_timer_start(&timer); - for (int j = 0; j < 100000000 / count; ++j) { - quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, - q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), - dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, - scan_order->scan, scan_order->iscan); - } - vpx_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); - if (i == 0) printf("Bypass calculations.\n"); - if (i == 1) printf("Full calculations.\n"); - printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, - elapsed_time / 1000); + runNTimes(10000000 / count); + const char *type = + (i == 0) ? 
"Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + printMedian(title); } - printf("\n"); } } @@ -557,6 +560,16 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8, 32, true))); #endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_vsx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH + // Only useful to compare "Speed" test results. INSTANTIATE_TEST_CASE_P( DISABLED_C, VP9QuantizeTest, @@ -575,10 +588,3 @@ INSTANTIATE_TEST_CASE_P( &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, true))); } // namespace - -#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_vsx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false))); -#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 8149a083f..b46ba101d 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -3,7 +3,7 @@ LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE:= libwebm LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat +LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11 LOCAL_C_INCLUDES:= $(LOCAL_PATH) LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH) diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index ebb5ff2f4..6d8b0b4cc 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 +Version: af81f26025b7435fa9a14ad07c58b44cf9280430 License: BSD License File: LICENSE.txt @@ -7,4 +7,14 @@ Description: libwebm is used to handle WebM container I/O. Local Changes: -* <none> +Only keep: + - Android.mk + - AUTHORS.TXT + - common/ + file_util.cc/h + hdr_util.cc/h + webmids.h + - LICENSE.TXT + - mkvmuxer/ + - mkvparser/ + - PATENTS.TXT diff --git a/third_party/libwebm/common/file_util.cc b/third_party/libwebm/common/file_util.cc index 6dab146dd..618ffc087 100644 --- a/third_party/libwebm/common/file_util.cc +++ b/third_party/libwebm/common/file_util.cc @@ -17,6 +17,7 @@ #include <cstring> #include <fstream> #include <ios> +#include <string> namespace libwebm { @@ -41,7 +42,12 @@ std::string GetTempFileName() { return temp_file_name; #else char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + errno_t err = (fname_pointer == &tmp_file_name[0]) ? 
0 : -1; +#endif if (err == 0) { return std::string(tmp_file_name); } @@ -65,6 +71,15 @@ uint64_t GetFileSize(const std::string& file_name) { return file_size; } +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast<size_t>(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } TempFileDeleter::~TempFileDeleter() { diff --git a/third_party/libwebm/common/file_util.h b/third_party/libwebm/common/file_util.h index 0e71eac11..a87373464 100644 --- a/third_party/libwebm/common/file_util.h +++ b/third_party/libwebm/common/file_util.h @@ -22,6 +22,9 @@ std::string GetTempFileName(); // Returns size of file specified by |file_name|, or 0 upon failure. uint64_t GetFileSize(const std::string& file_name); +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + // Manages life of temporary file specified at time of construction. Deletes // file upon destruction. class TempFileDeleter { @@ -38,4 +41,4 @@ class TempFileDeleter { } // namespace libwebm -#endif // LIBWEBM_COMMON_FILE_UTIL_H_
\ No newline at end of file +#endif // LIBWEBM_COMMON_FILE_UTIL_H_ diff --git a/third_party/libwebm/common/hdr_util.cc b/third_party/libwebm/common/hdr_util.cc index e1618ce75..916f7170b 100644 --- a/third_party/libwebm/common/hdr_util.cc +++ b/third_party/libwebm/common/hdr_util.cc @@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, if (MasteringMetadataValuePresent(parser_mm.luminance_min)) muxer_mm->set_luminance_min(parser_mm.luminance_min); - PrimaryChromaticityPtr r_ptr(NULL); - PrimaryChromaticityPtr g_ptr(NULL); - PrimaryChromaticityPtr b_ptr(NULL); - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); + PrimaryChromaticityPtr g_ptr(nullptr); + PrimaryChromaticityPtr b_ptr(nullptr); + PrimaryChromaticityPtr wp_ptr(nullptr); if (parser_mm.r) { if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr)) diff --git a/third_party/libwebm/common/hdr_util.h b/third_party/libwebm/common/hdr_util.h index 3ef5388fd..78e2eeb70 100644 --- a/third_party/libwebm/common/hdr_util.h +++ b/third_party/libwebm/common/hdr_util.h @@ -47,15 +47,7 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif -typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic pop -#endif +typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 15b9a908d..481771db2 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvmuxer.h" +#include <stdint.h> + #include <cfloat> #include <climits> #include <cstdio> @@ -24,11 +26,6 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvmuxer { const float PrimaryChromaticity::kChromaticityMin = 0.0f; @@ -72,7 +69,7 @@ bool StrCpy(const char* src, char** dst_ptr) { return true; } -typedef std::auto_ptr<PrimaryChromaticity> PrimaryChromaticityPtr; +typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyChromaticity(const PrimaryChromaticity* src, PrimaryChromaticityPtr* dst) { if (!dst) @@ -1057,22 +1054,22 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { bool MasteringMetadata::SetChromaticity( const PrimaryChromaticity* r, const PrimaryChromaticity* g, const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { - PrimaryChromaticityPtr r_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); if (r) { if (!CopyChromaticity(r, &r_ptr)) return false; } - PrimaryChromaticityPtr g_ptr(NULL); + PrimaryChromaticityPtr g_ptr(nullptr); if (g) { if (!CopyChromaticity(g, &g_ptr)) return false; } - PrimaryChromaticityPtr b_ptr(NULL); + PrimaryChromaticityPtr b_ptr(nullptr); if (b) { if (!CopyChromaticity(b, &b_ptr)) return false; } - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr wp_ptr(nullptr); if (white_point) { if (!CopyChromaticity(white_point, &wp_ptr)) return false; @@ -1238,7 +1235,7 @@ bool 
Colour::Write(IMkvWriter* writer) const { } bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { - std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); + std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -1546,7 +1543,7 @@ bool VideoTrack::Write(IMkvWriter* writer) const { } bool VideoTrack::SetColour(const Colour& colour) { - std::auto_ptr<Colour> colour_ptr(new Colour()); + std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -1574,7 +1571,7 @@ bool VideoTrack::SetColour(const Colour& colour) { } bool VideoTrack::SetProjection(const Projection& projection) { - std::auto_ptr<Projection> projection_ptr(new Projection()); + std::unique_ptr<Projection> projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -2666,7 +2663,7 @@ bool Cluster::QueueOrWriteFrame(const Frame* const frame) { // and write it if it is okay to do so (i.e.) no other track has an held back // frame with timestamp <= the timestamp of the frame in question. std::vector<std::list<Frame*>::iterator> frames_to_erase; - for (std::list<Frame *>::iterator + for (std::list<Frame*>::iterator current_track_iterator = stored_frames_[track_number].begin(), end = --stored_frames_[track_number].end(); current_track_iterator != end; ++current_track_iterator) { diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index 37f230d0a..e7b76f7da 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,12 +22,8 @@ #include "common/webmids.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvparser { +const long long kStringElementSizeLimit = 20 * 1000 * 1000; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; const float Projection::kValueNotPresent = FLT_MAX; @@ -330,7 +326,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size, delete[] str; str = NULL; - if (size >= LONG_MAX || size < 0) + if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit) return E_FILE_FORMAT_INVALID; // +1 for '\0' terminator @@ -5015,7 +5011,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, if (!reader || *mm) return false; - std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); + std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -5035,6 +5031,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_max = static_cast<float>(value); if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 || mm_ptr->luminance_max > 9999.99) { @@ -5044,6 +5044,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_min = static_cast<float>(value); if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 || mm_ptr->luminance_min > 
999.9999) { @@ -5096,7 +5100,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, if (!reader || *colour) return false; - std::auto_ptr<Colour> colour_ptr(new Colour()); + std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -5194,7 +5198,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, if (!reader || *projection) return false; - std::auto_ptr<Projection> projection_ptr(new Projection()); + std::unique_ptr<Projection> projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -7903,6 +7907,10 @@ long Block::Parse(const Cluster* pCluster) { return E_FILE_FORMAT_INVALID; curr.len = static_cast<long>(frame_size); + // Check if size + curr.len could overflow. + if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } size += curr.len; // contribution of this frame --frame_count; @@ -7964,6 +7972,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const { const long long tc0 = pCluster->GetTimeCode(); assert(tc0 >= 0); + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + const long long tc = tc0 + m_timecode; return tc; // unscaled timecode units @@ -7981,6 +7994,10 @@ long long Block::GetTime(const Cluster* pCluster) const { const long long scale = pInfo->GetTimeCodeScale(); assert(scale >= 1); + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } const long long ns = tc * scale; return ns; diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c index 5e8ca02b4..6c1d784d3 100644 --- a/tools/tiny_ssim.c +++ b/tools/tiny_ssim.c @@ -91,6 +91,7 @@ typedef struct input_file { int w; int h; int bit_depth; + int frame_size; } input_file_t; // Open a file and determine if its y4m or raw. If y4m get the header. @@ -119,10 +120,12 @@ static int open_input_file(const char *file_name, input_file_t *input, int w, fseek(input->file, 0, SEEK_SET); input->w = w; input->h = h; - if (bit_depth < 9) - input->buf = malloc(w * h * 3 / 2); - else - input->buf = malloc(w * h * 3); + // handle odd frame sizes + input->frame_size = w * h + ((w + 1) / 2 * (h + 1) / 2) * 2; + if (bit_depth > 8) { + input->frame_size *= 2; + input->buf = malloc(input->frame_size); + } break; } } @@ -150,15 +153,15 @@ static size_t read_input_file(input_file_t *in, unsigned char **y, break; case RAW_YUV: if (bd < 9) { - r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; *u = in->buf + in->w * in->h; - *v = in->buf + 5 * in->w * in->h / 4; + *v = *u + (1 + in->w) / 2 * (1 + in->h) / 2; } else { - r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; - *u = in->buf + in->w * in->h / 2; - *v = *u + in->w * in->h / 2; + *u = in->buf + (in->w * in->h) * 2; + *v = *u + 2 * ((1 + in->w) / 2 * (1 + in->h) / 2); } break; } @@ -325,7 +328,8 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, // (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) // // Replace c1 with n*n * c1 for the final step that leads to this code: -// The final step scales by 12 bits so we don't lose precision in the constants. +// The final step scales by 12 bits so we don't lose precision in the +// constants. static double ssimv_similarity(const Ssimv *sv, int64_t n) { // Scale the constants by number of pixels. 
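As an aside on the tiny_ssim.c changes above: the new frame_size expression rounds each chroma plane up independently, which is why the odd-width/odd-height rejection can be dropped further down. A minimal standalone sketch of the same arithmetic (the function name is illustrative, not part of the tool):

#include <stdio.h>

/* Bytes per I420 frame with chroma planes rounded up for odd luma
 * dimensions, mirroring the frame_size computation added to tiny_ssim.c. */
static int i420_frame_size(int w, int h, int bit_depth) {
  const int chroma_w = (w + 1) / 2;
  const int chroma_h = (h + 1) / 2;
  int size = w * h + chroma_w * chroma_h * 2; /* Y plus two chroma planes */
  if (bit_depth > 8) size *= 2;               /* two bytes per sample */
  return size;
}

int main(void) {
  /* 99x75 luma: chroma planes are 50x38, so 7425 + 2 * 1900 = 11225. */
  printf("8-bit  99x75: %d bytes\n", i420_frame_size(99, 75, 8));
  printf("10-bit 99x75: %d bytes\n", i420_frame_size(99, 75, 10));
  return 0;
}

The same (w + 1) / 2 and (h + 1) / 2 rounding is what the chroma-plane psnr_and_ssim calls switch to in the hunks that follow.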
@@ -628,9 +632,10 @@ int main(int argc, char *argv[]) { goto clean_up; } - // Number of frames to skip from file1.yuv for every frame used. Normal values - // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding - // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + // Number of frames to skip from file1.yuv for every frame used. Normal + // values 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL + // encoding in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer + // encoding. if (argc > 4) { sscanf(argv[4], "%d", &tl_skip); if (argc > 5) { @@ -644,12 +649,6 @@ int main(int argc, char *argv[]) { } } - if (w & 1 || h & 1) { - fprintf(stderr, "Invalid size %dx%d\n", w, h); - return_value = 1; - goto clean_up; - } - while (1) { size_t r1, r2; unsigned char *y[2], *u[2], *v[2]; @@ -703,8 +702,10 @@ int main(int argc, char *argv[]) { psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); } psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); - psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], w / 2, h / 2); - psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], w / 2, h / 2); + psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2, + (h + 1) / 2); + psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], (w + 1) / 2, + (h + 1) / 2); n_frames++; } diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index d67ee8a57..8c292d616 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -65,7 +65,7 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mbr, mbc; /* The pixel thresholds are adjusted according to if or not the macroblock diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 8bfd3cea3..0d54a9442 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -686,6 +686,12 @@ static unsigned int read_available_partition_size( const unsigned char *partition_size_ptr = token_part_sizes + i * 3; unsigned int partition_size = 0; ptrdiff_t bytes_left = fragment_end - fragment_start; + if (bytes_left < 0) { + vpx_internal_error( + &pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition. No bytes left %d.", + (int)bytes_left); + } /* Calculate the length of this partition. The last partition * size is implicit. If the partition size can't be read, then * either use the remaining data in the buffer (for EC mode) diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index b47840795..3b4fee7cf 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -428,7 +428,9 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { double weight_segment = 0; int thresh_low_motion = (cm->width < 720) ? 55 : 20; cr->apply_cyclic_refresh = 1; - if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + // TODO(jianj): Look into issue of cyclic refresh with high bitdepth. 
+ if (cm->bit_depth > 8 || cm->frame_type == KEY_FRAME || + cpi->svc.temporal_layer_id > 0 || (cpi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && @@ -457,6 +459,15 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_boost_fac = 13; } } + // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and + // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2 + // (rate_boost_fac = 10 disables segment#2). + // TODO(marpan): Consider increasing refresh rate after slide change. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 2.0; + cr->rate_boost_fac = 10; + } // Adjust some parameters for low resolutions. if (cm->width <= 352 && cm->height <= 288) { if (rc->avg_frame_bandwidth < 3000) { @@ -587,3 +598,12 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; } + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } +} diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index 77fa67c9e..f59f193f6 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -139,6 +139,8 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { return CR_SEGMENT_ID_BASE; } +void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index c96dc3fbd..0b3eef7b3 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -556,6 +556,7 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, } else { thresholds[1] = (5 * threshold_base) >> 1; } + if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX; } } @@ -4877,6 +4878,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; +#if CONFIG_CONSISTENT_RECODE + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; +#endif tile_data->mode_map[i][j] = j; } } @@ -5001,7 +5005,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; - +#if CONFIG_CONSISTENT_RECODE + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; +#endif if (xd->lossless) x->optimize = 0; cm->tx_mode = select_tx_mode(cpi, xd); @@ -5126,9 +5132,48 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } +#if CONFIG_CONSISTENT_RECODE +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = + tile_data->thresh_freq_fact_prev[i][j]; + } + } + } + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} +#endif + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_CONSISTENT_RECODE + restore_encode_params(cpi); +#endif + // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this @@ -5404,7 +5449,11 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); - if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0) + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); } } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3384de7ea..fca8f331d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3024,23 +3024,28 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; if (cm->frame_type == KEY_FRAME) { int i; - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; // On key frame update all reference frame slots. for (i = 0; i < REF_FRAMES; i++) { + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; // LAST/GOLDEN/ALTREF is already updated above. 
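Stepping back to the CONFIG_CONSISTENT_RECODE additions above: restore_encode_params() reloads, at the top of vp9_encode_frame(), the snapshot that save_encode_params() (added further on in vp9_encoder.c and called just before the recode loop) takes of the adaptive RD thresholds, so every recode of the same frame starts from identical state rather than from thresholds mutated by the previous attempt. A reduced sketch of the pattern (the struct and field are simplified stand-ins for RD_OPT and TileDataEnc, and the array size is arbitrary):

#include <string.h>

#define NUM_MODES 30 /* arbitrary size for the sketch */

typedef struct {
  int thresh_freq_fact[NUM_MODES];      /* adapted while encoding */
  int thresh_freq_fact_prev[NUM_MODES]; /* snapshot taken pre-encode */
} TileThresh;

/* Taken once per frame, before the first encode attempt. */
static void save_params(TileThresh *t) {
  memcpy(t->thresh_freq_fact_prev, t->thresh_freq_fact,
         sizeof(t->thresh_freq_fact));
}

/* Run at the start of every encode attempt (including recodes), so each
 * attempt sees the same starting thresholds. */
static void restore_params(TileThresh *t) {
  memcpy(t->thresh_freq_fact, t->thresh_freq_fact_prev,
         sizeof(t->thresh_freq_fact_prev));
}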
if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx) ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); } } else { - if (cpi->refresh_last_frame) - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - if (cpi->refresh_golden_frame) - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - if (cpi->refresh_alt_ref_frame) - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; + if (cpi->refresh_last_frame) { + svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_golden_frame) { + svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_alt_ref_frame) { + svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id; + } } // Copy flags from encoder to SVC struct. vp9_copy_flags_ref_update_idx(cpi); @@ -3574,8 +3579,41 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +#if CONFIG_CONSISTENT_RECODE +static void save_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact_prev[i][j] = + tile_data->thresh_freq_fact[i][j]; + } + } + } + } +} +#endif + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. const INTERP_FILTER filter_scaler = @@ -3686,12 +3724,23 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cm->show_frame && cpi->oxcf.mode == REALTIME && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || - (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) vp9_scene_detection_onepass(cpi); if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = cpi->rc.high_source_sad; + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, or if base layer is key for svc. + // Don't drop on scene change. + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME && !cpi->rc.high_source_sad && + !cpi->svc.high_source_sad_superframe && + (!cpi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. 
Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). @@ -3715,7 +3764,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } } - if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + // TODO(jianj): Look into issue of skin detection with high bitdepth. + if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { @@ -3729,10 +3779,12 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); - // For SVC on non-zero spatial layer: check for disabling inter-layer - // prediction. - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) - vp9_svc_constrain_inter_layer_pred(cpi); + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (cpi->svc.spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. @@ -3795,6 +3847,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // seen in the last encoder iteration. // update_base_skip_probs(cpi); vpx_clear_system_state(); + return 1; } #define MAX_QSTEP_ADJ 4 @@ -4485,11 +4538,21 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cpi->oxcf.target_bandwidth == 0) { cpi->svc.skip_enhancement_layer = 1; vp9_rc_postencode_update_drop_frame(cpi); - vp9_inc_frame_in_layer(cpi); cpi->ext_refresh_frame_flags_pending = 0; cpi->last_frame_dropped = 1; cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + if (cpi->svc.framedrop_mode == LAYER_DROP || + cpi->svc.drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } return; } @@ -4538,55 +4601,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } } - // For 1 pass CBR, check if we are dropping this frame. - // Never drop on key frame, or if base layer is key for svc. - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && - cm->frame_type != KEY_FRAME && - (!cpi->use_svc || - !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { - int svc_prev_layer_dropped = 0; - // In the contrained framedrop mode for svc (framedrop_mode = - // CONSTRAINED_LAYER_DROP), if the previous spatial layer was dropped, drop - // the current spatial layer. 
- if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) - svc_prev_layer_dropped = 1; - if ((svc_prev_layer_dropped && - cpi->svc.framedrop_mode == CONSTRAINED_LAYER_DROP) || - vp9_rc_drop_frame(cpi)) { - vp9_rc_postencode_update_drop_frame(cpi); - cpi->ext_refresh_frame_flags_pending = 0; - cpi->last_frame_dropped = 1; - if (cpi->use_svc) { - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; - vp9_inc_frame_in_layer(cpi); - cpi->svc.skip_enhancement_layer = 1; - if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { - int i; - int all_layers_drop = 1; - for (i = 0; i < cpi->svc.spatial_layer_id; i++) { - if (cpi->svc.drop_spatial_layer[i] == 0) { - all_layers_drop = 0; - break; - } - } - if (all_layers_drop == 1) cpi->svc.skip_enhancement_layer = 0; - } - } - return; - } - } - vpx_clear_system_state(); #if CONFIG_INTERNAL_STATS memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif +#if CONFIG_CONSISTENT_RECODE + // Backup to ensure consistency between recodes + save_encode_params(cpi); +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest); + if (!encode_without_recode_loop(cpi, size, dest)) return; } else { encode_with_recode_loop(cpi, size, dest); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1e0ed70fb..f66c13046 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -282,6 +282,9 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; +#if CONFIG_CONSISTENT_RECODE + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; +#endif int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 453879fb8..c76dfd351 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2201,8 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = - rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); else @@ -2441,9 +2440,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); // Monitor for static sections. - if ((rc->frames_since_key + i - 1) > 1) { - zero_motion_accumulator *= get_zero_motion_factor(cpi, &next_frame); - } + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2464,18 +2462,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - // Break at maximum of active_max_gf_interval unless almost totally static. - // - // Note that the addition of a test of rc->source_alt_ref_active is - // deliberate. The effect of this is that after a normal altref group even - // if the material is static there will be one normal length GF group - // before allowing longer GF groups. 
The reason for this is that in cases - // such as slide shows where slides are separated by a complex transition - // such as a fade, the arf group spanning the transition may not be coded - // at a very high quality and hence this frame (with its overlay) is a - // poor golden frame to use for an extended group. - if (((i >= active_max_gf_interval) && - ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || + if ( + // Break at active_max_gf_interval unless almost totally static. + ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && @@ -2495,8 +2484,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if ((zero_motion_accumulator < 0.995) && allow_alt_ref && - (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { + if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && + (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2523,11 +2512,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - rc->baseline_gf_interval = - ((twopass->kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) && - (i >= rc->frames_to_key)) - ? i - : (i - (is_key_frame || rc->source_alt_ref_pending)); + // Set the interval until the next gf. + rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -2774,7 +2760,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; - rc->frames_since_key = 0; // Reset the GF group data structures. vp9_zero(*gf_group); @@ -2919,22 +2904,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // The zero motion test here insures that if we mark a kf group as static - // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. - // It also allows for a larger boost on long static groups. - if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) { + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { double frame_boost; double zm_factor; // Monitor for static sections. - // First frame in kf group the second ref indicator is invalid. - if (i > 0) { - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - } else { - zero_motion_accumulator = - next_frame.pcnt_inter - next_frame.pcnt_motion; - } + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -2971,16 +2947,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Special case for static / slide show content but dont apply - // if the kf group is very short. 
- if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { - rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST); - } else { - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); - } + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index aa497e3da..000ecd779 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -120,12 +120,12 @@ typedef enum { typedef struct { unsigned char index; unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; - FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; - int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1]; + RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; + FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; + int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; } GF_GROUP; typedef struct { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1ba518af8..60d5c89b1 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -224,6 +224,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (rv && search_subpel) { int subpel_force_stop = cpi->sf.mv.subpel_force_stop; if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -1421,7 +1429,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const SVC *const svc = &cpi->svc; + SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; @@ -1495,7 +1503,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; MV_REFERENCE_FRAME best_second_ref_frame = NONE; - MV_REFERENCE_FRAME spatial_ref = GOLDEN_FRAME; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? 
RT_INTER_MODES_SVC : RT_INTER_MODES; @@ -1504,25 +1512,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int svc_mv_row = 0; int no_scaling = 0; unsigned int thresh_svc_skip_golden = 500; - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id - 1, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; } - if (cpi->svc.spatial_layer_id > 0 && - (cpi->svc.high_source_sad_superframe || no_scaling)) + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) thresh_svc_skip_golden = 0; // Lower the skip threshold if lower spatial layer is better quality relative // to current layer. - else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 && - cm->base_qindex > cpi->svc.lower_layer_qindex + 15) + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) thresh_svc_skip_golden = 100; // Increase skip threshold if lower spatial layer is lower quality relative // to current layer. - else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 && - cm->base_qindex < cpi->svc.lower_layer_qindex - 20) + else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) thresh_svc_skip_golden = 1000; init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1585,10 +1593,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { if (cpi->use_svc) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; } if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) @@ -1623,19 +1631,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. 
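One indexing detail that recurs throughout these pickmode and rate-control changes: LAYER_IDS_TO_IDX() flattens a (spatial, temporal) layer pair into an index into svc->layer_context[]. The macro body is not part of this diff, so the sketch below assumes the conventional layout of number_temporal_layers entries per spatial layer:

/* Assumed flattening: temporal layers are contiguous within a spatial layer. */
static int layer_ids_to_idx(int sl, int tl, int num_tl) {
  return sl * num_tl + tl;
}

Under that assumption, the call above for thresh_svc_skip_golden, LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, svc->number_temporal_layers), selects the layer context of the next lower spatial layer at the same temporal layer.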
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && - cpi->svc.spatial_layer_id > 0) { + svc->spatial_layer_id > 0) { if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; - spatial_ref = LAST_FRAME; + inter_layer_ref = LAST_FRAME; } } if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; - spatial_ref = GOLDEN_FRAME; + inter_layer_ref = GOLDEN_FRAME; } } } @@ -1652,6 +1660,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1677,6 +1689,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). + skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, @@ -1692,9 +1708,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // an averaging filter for downsampling (phase = 8). If so, we will test // a nonzero motion mode on the spatial reference. // The nonzero motion is half pixel shifted to left and top (-4, -4). - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - svc_force_zero_mode[spatial_ref - 1] && - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8) { svc_mv_col = -4; svc_mv_row = -4; flag_svc_subpel = 1; @@ -1713,7 +1729,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int inter_mv_mode = 0; int skip_this_mv = 0; int comp_pred = 0; - int force_gf_mv = 0; + int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; second_ref_frame = NONE; @@ -1743,8 +1759,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) continue; - if (flag_svc_subpel && ref_frame == spatial_ref) { - force_gf_mv = 1; + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), // otherwise set NEWMV to (svc_mv_col, svc_mv_row). 
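A note on the (svc_mv_col, svc_mv_row) = (-4, -4) constant set here: VP9 motion vectors are expressed in eighth-pel units, so -4 is exactly the half-pixel shift up and to the left that matches the phase-8 averaging downsampling filter mentioned in the comment above. A one-line check of the conversion:

#include <stdio.h>

int main(void) {
  const int svc_mv_col = -4, svc_mv_row = -4; /* eighth-pel units */
  /* -4 / 8.0 = -0.5 pixel in each direction. */
  printf("offset = (%.1f, %.1f) pixels\n", svc_mv_row / 8.0, svc_mv_col / 8.0);
  return 0;
}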
if (this_mode == NEWMV) { @@ -1771,8 +1787,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, sse_zeromv_normalized < thresh_svc_skip_golden) continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { + frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1802,14 +1820,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (const_motion[ref_frame] && this_mode == NEARMV) continue; // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1823,7 +1840,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } @@ -1883,9 +1900,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (this_mode == NEWMV && !force_gf_mv) { + if (this_mode == NEWMV && !force_mv_inter_layer) { if (ref_frame > LAST_FRAME && !cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { int tmp_sad; @@ -1931,7 +1948,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, // for SVC encoding. - if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && frame_mv[NEWMV][ref_frame].as_mv.row == 0 && frame_mv[NEWMV][ref_frame].as_mv.col == 0) continue; @@ -2028,7 +2045,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && !force_gf_mv && + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; @@ -2254,12 +2271,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // layer is chosen as the reference. Always perform intra prediction if // LAST is the only reference, or is_key_frame is set, or on base // temporal layer. 
- if (cpi->svc.spatial_layer_id) { + if (svc->spatial_layer_id) { perform_intra_pred = - cpi->svc.temporal_layer_id == 0 || - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || - (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 599337f80..11547fb2e 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -273,6 +273,14 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { const VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + // On dropped frame, don't update buffer if its currently stable + // (above optimal level). This can cause issues when full superframe + // can drop (!= LAYER_DROP), since QP is adjusted downwards with buffer + // overflow, which can cause more frame drops. + if (cpi->svc.framedrop_mode != LAYER_DROP && encoded_frame_size == 0 && + rc->buffer_level > rc->optimal_buffer_level) + return; + // Non-viewable frames are a special case and are treated as pure overhead. if (!cm->show_frame) { rc->bits_off_target -= encoded_frame_size; @@ -390,7 +398,31 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; } -static int check_buffer(VP9_COMP *cpi, int drop_mark) { +static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level > drop_mark); + } else { + int i; + // For SVC in the FULL_SUPERFRAME_DROP): the condition on + // buffer (if its above threshold, so no drop) is checked on current and + // upper spatial layers. If any spatial layer is not above threshold then + // we return 0. + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + const int drop_mark_layer = + (int)(cpi->svc.framedrop_thresh[i] * lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } + return 1; + } +} + +static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { SVC *svc = &cpi->svc; if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) { RATE_CONTROL *const rc = &cpi->rc; @@ -398,8 +430,10 @@ static int check_buffer(VP9_COMP *cpi, int drop_mark) { } else { int i; // For SVC in the constrained framedrop mode (svc->framedrop_mode = - // CONSTRAINED_LAYER_DROP): the condition on buffer (to drop frame) is - // checked on current and upper spatial layers. + // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on + // buffer (if its below threshold, so drop frame) is checked on current + // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any + // spatial layer is <= threshold, then we return 1 (drop). 
for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, svc->number_temporal_layers); @@ -407,23 +441,42 @@ static int check_buffer(VP9_COMP *cpi, int drop_mark) { RATE_CONTROL *lrc = &lc->rc; const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * lrc->optimal_buffer_level / 100); - if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } } - return 1; + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) + return 0; + else + return 1; } } -int vp9_rc_drop_frame(VP9_COMP *cpi) { +static int drop_frame(VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + SVC *svc = &cpi->svc; int drop_frames_water_mark = oxcf->drop_frames_water_mark; - if (cpi->use_svc) - drop_frames_water_mark = - cpi->svc.framedrop_thresh[cpi->svc.spatial_layer_id]; - if (!drop_frames_water_mark) { + if (cpi->use_svc) { + // If we have dropped max_consec_drop frames, then we don't + // drop this spatial layer, and reset counter to 0. + if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) { + svc->drop_count[svc->spatial_layer_id] = 0; + return 0; + } else { + drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id]; + } + } + if (!drop_frames_water_mark || + (svc->spatial_layer_id > 0 && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { return 0; } else { - if (rc->buffer_level < 0) { + if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) || + (check_buffer_below_thresh(cpi, -1) && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { // Always drop if buffer is below 0. return 1; } else { @@ -431,9 +484,11 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { // (starting with the next frame) until it increases back over drop_mark. int drop_mark = (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100); - if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + if (check_buffer_above_thresh(cpi, drop_mark) && + (rc->decimation_factor > 0)) { --rc->decimation_factor; - } else if (check_buffer(cpi, drop_mark) && rc->decimation_factor == 0) { + } else if (check_buffer_below_thresh(cpi, drop_mark) && + rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -452,6 +507,75 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { } } +int vp9_rc_drop_frame(VP9_COMP *cpi) { + SVC *svc = &cpi->svc; + int svc_prev_layer_dropped = 0; + // In the constrained or full_superframe framedrop mode for svc + // (framedrop_mode != LAYER_DROP), if the previous spatial layer was + // dropped, drop the current spatial layer. 
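To make the new per-layer drop accounting concrete: each spatial layer keeps a consecutive-drop counter, and once it reaches max_consec_drop the frame is encoded regardless of buffer state and the counter resets, so a stressed layer cannot be dropped indefinitely. A toy driver of that rule (the drop requests are fabricated; only the counter behaviour is the point):

#include <stdio.h>

int main(void) {
  const int max_consec_drop = 3;
  int drop_count = 0;
  int frame;
  for (frame = 0; frame < 8; ++frame) {
    int want_drop = 1; /* pretend rate control asks to drop every frame */
    if (drop_count == max_consec_drop) {
      drop_count = 0; /* cap reached: force this frame to be encoded */
      want_drop = 0;
    }
    if (want_drop) ++drop_count;
    printf("frame %d: %s\n", frame, want_drop ? "dropped" : "encoded");
  }
  return 0;
}

The default set in vp9_init_layer_context() further down is INT_MAX, which leaves the cap effectively disabled until an application lowers it.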
+ if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) + svc_prev_layer_dropped = 1; + if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP) || + drop_frame(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + if (cpi->use_svc) { + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + if (svc->framedrop_mode == LAYER_DROP || + svc->drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + int all_layers_drop = 1; + for (i = 0; i < svc->spatial_layer_id; i++) { + if (svc->drop_spatial_layer[i] == 0) { + all_layers_drop = 0; + break; + } + } + if (all_layers_drop == 1) svc->skip_enhancement_layer = 0; + } + } + return 1; + } + return 0; +} + +static int adjust_q_cbr(const VP9_COMP *cpi, int q) { + // This makes sure q is between oscillating Qs to prevent resonance. + if (!cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + // If the previous frame had overshoot and the current q needs to increase + // above the clamped value, reduce the clamp for faster reaction to + // overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + vp9_cyclic_refresh_limit_q(cpi, &q); + return q; +} + static double get_rate_correction_factor(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; double rcf; @@ -610,22 +734,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, } } while (++i <= active_worst_quality); - // In CBR mode, this makes sure q is between oscillating Qs to prevent - // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && - (!cpi->oxcf.gf_cbr_boost_pct || - !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && - (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && - cpi->rc.q_1_frame != cpi->rc.q_2_frame) { - int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), - VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); - // If the previous had overshoot and the current q needs to increase above - // the clamped value, reduce the clamp for faster reaction to overshoot. - if (cpi->rc.rc_1_frame == -1 && q > qclamp) - q = (q + qclamp) >> 1; - else - q = qclamp; - } + // Adjustment to q for CBR mode. 
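The adjust_q_cbr() helper factored out above keeps CBR Q from ping-ponging: when the rate-correction direction flipped sign between the last two frames (rc_1_frame * rc_2_frame == -1), the new Q is clamped between those two frames' Qs, and if the previous frame overshot while Q wants to rise above the clamp, the midpoint is used for a faster reaction. A self-contained sketch of just that clamp (helper names are local to this example; the reset_high_source_sad and golden-boost guards are omitted):

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* Damp Q oscillation between two alternating operating points. */
static int damp_q(int q, int q_1_frame, int q_2_frame, int rc_1_frame,
                  int rc_2_frame) {
  if (rc_1_frame * rc_2_frame == -1 && q_1_frame != q_2_frame) {
    const int lo = q_1_frame < q_2_frame ? q_1_frame : q_2_frame;
    const int hi = q_1_frame < q_2_frame ? q_2_frame : q_1_frame;
    const int qclamp = clamp_int(q, lo, hi);
    /* Previous frame overshot: relax the clamp upward by half the gap. */
    if (rc_1_frame == -1 && q > qclamp) return (q + qclamp) >> 1;
    return qclamp;
  }
  return q;
}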
+ if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q); + return q; } @@ -730,8 +841,10 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. - // Maximum limit for down adjustment, ~30%. + // Maximum limit for down adjustment ~30%; make it lower for screen content. int max_adjustment_down = active_worst_quality / 3; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + max_adjustment_down = active_worst_quality >> 3; if (max_adjustment_down) { buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / max_adjustment_down); @@ -1118,9 +1231,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); - if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { - active_best_quality /= 4; - } // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -1500,7 +1610,11 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) { + if (cm->frame_type != KEY_FRAME && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) { compute_frame_low_motion(cpi); if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); } @@ -1867,8 +1981,13 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( cpi->framerate, rc->min_gf_interval); - // Extended max interval for genuinely static scenes like slide shows. - rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + + if (is_altref_enabled(cpi)) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; @@ -2426,6 +2545,19 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) rc->this_frame_target = rc->avg_frame_bandwidth; } + // For SVC the new (updated) avg_source_sad[0] for the current superframe + // updates the setting for all layers. + if (cpi->use_svc) { + int sl, tl; + SVC *const svc = &cpi->svc; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_source_sad[0] = rc->avg_source_sad[0]; + } + } // For VBR, under scene change/high content change, force golden refresh. 
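Relatedly, the static-scene GF interval set in vp9_rc_set_gf_interval_range() above is now tied to the encoder lag: it starts from MAX_LAG_BUFFERS * 2, is capped at lag_in_frames - 1 when alt-ref frames are enabled, and then bounds max_gf_interval itself. A compact sketch of that ordering (parameters stand in for the rc and oxcf fields of the same names):

/* Mirror of the interval-capping order in vp9_rc_set_gf_interval_range(). */
static int capped_max_gf_interval(int max_lag_buffers, int lag_in_frames,
                                  int altref_enabled, int max_gf_interval) {
  int static_scene_max = max_lag_buffers * 2;
  if (altref_enabled && static_scene_max > lag_in_frames - 1)
    static_scene_max = lag_in_frames - 1; /* cannot exceed the actual lag */
  if (max_gf_interval > static_scene_max) max_gf_interval = static_scene_max;
  return max_gf_interval;
}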
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2459,8 +2591,11 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int thresh_qp = 3 * (rc->worst_quality >> 2); - int thresh_rate = rc->avg_frame_bandwidth * 10; + int thresh_qp = 7 * (rc->worst_quality >> 3); + int thresh_rate = rc->avg_frame_bandwidth << 3; + // Lower rate threshold for video. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + thresh_rate = rc->avg_frame_bandwidth << 2; if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) { double rate_correction_factor = cpi->rc.rate_correction_factors[INTER_NORMAL]; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 3a40e0138..c1b210677 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -34,14 +34,6 @@ extern "C" { #define FRAME_OVERHEAD_BITS 200 -// Threshold used to define a KF group as static (e.g. a slide show). -// Essentially this means that no frame in the group has more than 1% of MBs -// that are not marked as coded with 0,0 motion in the first pass. -#define STATIC_KF_GROUP_THRESH 99 - -// The maximum duration of a GF group that is static (for example a slide show). -#define MAX_STATIC_GF_GROUP_LENGTH 250 - typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 59022c106..8201bba70 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -108,7 +108,11 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#if CONFIG_CONSISTENT_RECODE + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#endif int RDMULT; int RDDIV; } RD_OPT; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e39df033a..e3672edf5 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -847,7 +847,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX } }; int n; int s0, s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = ref_best_rd; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; const int tx_size_ctx = get_tx_size_context(xd); @@ -868,8 +868,8 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (n = start_tx; n >= end_tx; n--) { const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; - txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, - bs, n, cpi->sf.use_fast_coef_costing); + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -3612,9 +3612,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { - // If adaptive interp filter is enabled, then the current leaf node of 8x8 - // data is needed for sub8x8. Hence preserve the context. +// If adaptive interp filter is enabled, then the current leaf node of 8x8 +// data is needed for sub8x8. Hence preserve the context. 
+#if CONFIG_CONSISTENT_RECODE + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#else if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 90da68726..d2842697d 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -375,6 +375,8 @@ static void set_rt_speed_feature_framesize_independent( sf->nonrd_keyframe = 0; sf->svc_use_lowres_part = 0; sf->re_encode_overshoot_rt = 0; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -537,8 +539,14 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) sf->nonrd_keyframe = 1; if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && - cpi->oxcf.content == VP9E_CONTENT_SCREEN) + (cpi->use_svc || cpi->oxcf.content == VP9E_CONTENT_SCREEN)) { sf->re_encode_overshoot_rt = 1; + } + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } } if (speed >= 6) { @@ -661,6 +669,21 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; } + + if (speed >= 9) { + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 2; + if (cpi->rc.avg_frame_low_motion < 40) + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = 1; + sf->mv.adapt_subpel_force_stop.force_stop_above = 2; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + } + if (sf->use_altref_onepass) { if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { sf->partition_search_type = FIXED_PARTITION; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 946bf0545..251cfdbcd 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -161,6 +161,17 @@ typedef enum { ONE_LOOP_REDUCED = 1 } FAST_COEFF_UPDATE; +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for full pixel motion vector; + int mv_thresh; + + // subpel_force_stop if full pixel MV is below the threshold. + int force_stop_below; + + // subpel_force_stop if full pixel MV is equal to or above the threshold. + int force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -189,6 +200,11 @@ typedef struct MV_SPEED_FEATURES { // 3: Stop at full pixel. int subpel_force_stop; + // If it's enabled, different subpel_force_stop will be used for different MV. + int enable_adaptive_subpel_force_stop; + + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; + // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; } MV_SPEED_FEATURES; @@ -515,6 +531,12 @@ typedef struct SPEED_FEATURES { // Enable re-encoding on scene change with potential high overshoot, // for real-time encoding flow. 
int re_encode_overshoot_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling golden reference. + int disable_golden_ref; } SPEED_FEATURES; struct VP9_COMP; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 07d1995a8..d745ae0df 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -41,7 +41,10 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; svc->framedrop_mode = CONSTRAINED_LAYER_DROP; - for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; + for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = -1; + svc->fb_idx_temporal_layer_id[i] = -1; + } for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { svc->last_layer_dropped[sl] = 0; svc->drop_spatial_layer[sl] = 0; @@ -52,7 +55,10 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; } + svc->max_consec_drop = INT_MAX; if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, @@ -787,7 +793,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = 0; if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id]) { + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] && + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] != -1 && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) { // For fixed/non-flexible mode, if the previous frame (same spatial layer // from previous superframe) was dropped, make sure the lst_fb_idx // for this frame corresponds to the buffer index updated on (last) encoded @@ -903,12 +911,11 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { } } } - // Check for disabling inter-layer prediction if - // INTER_LAYER_PRED_ON_CONSTRAINED is enabled. - // If the reference for inter-layer prediction (the reference that is scaled) - // is not the previous spatial layer from the same superframe, then we - // disable inter-layer prediction. - if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_ON_CONSTRAINED) { + // Check for disabling inter-layer prediction if the reference for inter-layer + // prediction (the reference that is scaled) is not the previous spatial layer + // from the same superframe, then we disable inter-layer prediction. + // Only need to check when inter_layer prediction is not set to OFF mode. + if (cpi->svc.disable_inter_layer_pred != INTER_LAYER_PRED_OFF) { // We only use LAST and GOLDEN for prediction in real-time mode, so we // check both here. MV_REFERENCE_FRAME ref_frame; @@ -940,3 +947,46 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { } } } + +void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // For fixed/non-flexible mode, the folllowing constraint are expected, + // when inter-layer prediciton is on (default). 
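// Concretely, on non-key frames LAST acts as the temporal reference (its
// buffer was last refreshed by the base or a lower temporal layer) and GOLDEN
// acts as the spatial reference (its buffer was refreshed by the spatial
// layer directly below, at the same temporal layer). A sketch of the
// bookkeeping these asserts rely on -- hypothetical helper, the actual
// recording happens wherever a frame buffer slot is refreshed:
//   svc->fb_idx_spatial_layer_id[idx] = svc->spatial_layer_id;
//   svc->fb_idx_temporal_layer_id[idx] = svc->temporal_layer_id;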
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && + svc->framedrop_mode != LAYER_DROP) { + if (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) { + // On non-key frames: LAST is always temporal reference, GOLDEN is + // spatial reference. + if (svc->temporal_layer_id == 0) + // Base temporal only predicts from base temporal. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from lower temporal layer. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0) { + // Non-base spatial only predicts from lower spatial layer with same + // temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0) { + // Only 1 reference for frame whose base is key; reference may be LAST + // or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 617717049..9be5bb7ea 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -96,7 +96,6 @@ typedef struct SVC { int lst_fb_idx[VPX_MAX_LAYERS]; int gld_fb_idx[VPX_MAX_LAYERS]; int alt_fb_idx[VPX_MAX_LAYERS]; - int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; int non_reference_frame; @@ -122,6 +121,8 @@ typedef struct SVC { int last_layer_dropped[VPX_MAX_LAYERS]; int drop_spatial_layer[VPX_MAX_LAYERS]; int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int max_consec_drop; SVC_LAYER_DROP_MODE framedrop_mode; INTER_LAYER_PRED disable_inter_layer_pred; @@ -141,7 +142,12 @@ typedef struct SVC { // Keep track of the frame buffer index updated/refreshed on the base // temporal superframe. - uint8_t fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. 
+ uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; } SVC; struct VP9_COMP; @@ -201,6 +207,8 @@ void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5eaa7a18a..2758314fb 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -248,7 +248,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, row_mt, 0, 1); RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -8, 8); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -709,6 +709,8 @@ static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args); + // TODO(jianj): Look into issue of noise estimation with high bitdepth. + if (ctx->cfg.g_bit_depth > 8) extra_cfg.noise_sensitivity = 0; return update_extra_cfg(ctx, &extra_cfg); } @@ -1536,6 +1538,8 @@ static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx, cpi->svc.framedrop_mode = data->framedrop_mode; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl]; + // Don't allow max_consec_drop values below 1. + cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); return VPX_CODEC_OK; } diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index b201d96f4..44519e063 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -786,7 +786,8 @@ typedef struct vpx_svc_ref_frame_config { typedef enum { CONSTRAINED_LAYER_DROP, /**< Upper layers are constrained to drop if current layer drops. */ - LAYER_DROP, /**< Any spatial layer can drop. */ + LAYER_DROP, /**< Any spatial layer can drop. */ + FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ } SVC_LAYER_DROP_MODE; /*!\brief vp9 svc frame dropping parameters. @@ -799,7 +800,8 @@ typedef enum { typedef struct vpx_svc_frame_drop { int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */ SVC_LAYER_DROP_MODE - framedrop_mode; /**< Layer-based or constrained dropping. */ + framedrop_mode; /**< Layer-based or constrained dropping. */ + int max_consec_drop; /**< Maximum consecutive drops, for any layer. 
*/ } vpx_svc_frame_drop_t; /*!\cond */ diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c index 1370ec2d2..5afdece0a 100644 --- a/vpx_dsp/arm/avg_pred_neon.c +++ b/vpx_dsp/arm/avg_pred_neon.c @@ -17,8 +17,8 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { - int x, y; - for (y = 0; y < height; ++y) { + int x, y = height; + do { for (x = 0; x < width; x += 16) { const uint8x16_t p = vld1q_u8(pred + x); const uint8x16_t r = vld1q_u8(ref + x); @@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, comp += width; pred += width; ref += ref_stride; - } + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); } else { - int i; - for (i = 0; i < width * height; i += 16) { + int i = width * height; + assert(width == 4); + do { const uint8x16_t p = vld1q_u8(pred); uint8x16_t r; - if (width == 4) { - r = load_unaligned_u8q(ref, ref_stride); - ref += 4 * ref_stride; - } else { - const uint8x8_t r_0 = vld1_u8(ref); - const uint8x8_t r_1 = vld1_u8(ref + ref_stride); - assert(width == 8); - r = vcombine_u8(r_0, r_1); - ref += 2 * ref_stride; - } + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; r = vrhaddq_u8(r, p); vst1q_u8(comp, r); pred += 16; comp += 16; - } + i -= 16; + } while (i); } } diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index ce81fb630..eef123368 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -9,71 +9,72 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; + int r = rows, c; if (cols > 16) { - for (r = 0; r < rows; ++r) { + do { for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + const uint8x16_t s0 = vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); + const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), 
vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, (int)src_stride); + const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } diff --git a/vpx_dsp/ppc/deblock_vsx.c b/vpx_dsp/ppc/deblock_vsx.c new file mode 100644 index 000000000..4329081ee --- /dev/null +++ b/vpx_dsp/ppc/deblock_vsx.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +extern const int16_t vpx_rv[]; + +static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, + 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static INLINE uint8x16_t vec_abd_s8(uint8x16_t a, uint8x16_t b) { + return vec_sub(vec_max(a, b), vec_min(a, b)); +} + +static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v, + uint8x16_t filter) { + const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]); + const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]); + const uint8x16_t k3 = vec_avg(k1, k2); + const uint8x16_t f_a = vec_max(vec_abd_s8(v, ctx[0]), vec_abd_s8(v, ctx[1])); + const uint8x16_t f_b = vec_max(vec_abd_s8(v, ctx[2]), vec_abd_s8(v, ctx[3])); + const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter); + return vec_sel(v, vec_avg(k3, v), mask); +} + +static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src, + int stride) { + ctx[0] = vec_vsx_ld(col - 2 * stride, src); + ctx[1] = vec_vsx_ld(col - stride, src); + ctx[2] = vec_vsx_ld(col + stride, src); + ctx[3] = vec_vsx_ld(col + 2 * stride, src); +} + +static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx, + uint8x16_t v, uint8x16_t right_ctx) { + static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1A, 0x1B, 0x1C, 0x1D }; + + static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E }; + + static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10 }; + + static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11 }; + ctx[0] = vec_perm(left_ctx, v, l2_perm); + ctx[1] = vec_perm(left_ctx, v, l1_perm); + ctx[2] = vec_perm(v, right_ctx, r1_perm); + ctx[3] = vec_perm(v, right_ctx, r2_perm); +} +void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int cols, + unsigned char *f, int size) { + int row, col; + uint8x16_t ctx[4], out, v, left_ctx; + + for (row = 0; row < size; row++) { + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + /* now post_proc_across */ + left_ctx = vec_splats(dst_ptr[0]); + v = vec_vsx_ld(0, dst_ptr); + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = (col + 16 == cols) + ? 
vec_splats(dst_ptr[cols - 1]) + : vec_vsx_ld(col, dst_ptr + 16); + horz_ctx(ctx, left_ctx, v, right_ctx); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + left_ctx = v; + v = right_ctx; + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]); + horz_ctx(ctx, left_ctx, v, right_ctx); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +// C: s[c + 7] +static INLINE int16x8_t next7l_s16(uint8x16_t c) { + static const uint8x16_t next7_perm = { + 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13, + 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17, + }; + return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm); +} + +// Slide across window and add. +static INLINE int16x8_t slide_sum_s16(int16x8_t x) { + // x = A B C D E F G H + // + // 0 A B C D E F G + const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); + // 0 0 A B C D E F + const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), + // 0 0 0 A B C D E + vec_slo(x, vec_splats((int8_t)(6 << 3)))); + // 0 0 0 0 A B C D + const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), + // 0 0 0 0 0 A B C + vec_slo(x, vec_splats((int8_t)(10 << 3)))); + // 0 0 0 0 0 0 A B + const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), + // 0 0 0 0 0 0 0 A + vec_slo(x, vec_splats((int8_t)(14 << 3)))); + return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4)); +} + +// Slide across window and add. +static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { + // 0 A C E + // + 0 B D F + int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); + // 0 0 A C + // + 0 0 B D + int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); + // 0 0 0 A + // + 0 0 0 B + int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3)))); + sumsq_1 = vec_add(sumsq_1, xsq_even); + sumsq_2 = vec_add(sumsq_2, sumsq_3); + return vec_add(sumsq_1, sumsq_2); +} + +// C: (b + sum + val) >> 4 +static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) { + return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4)); +} + +// C: sumsq * 15 - sum * sum +static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd, + int16x8_t sum, int32x4_t lim) { + static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, + 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + const int32x4_t sumsq_odd_scaled = + vec_mul(sumsq_odd, vec_splats((int32_t)15)); + const int32x4_t sumsq_even_scaled = + vec_mul(sumsq_even, vec_splats((int32_t)15)); + const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum)); + const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum)); + + const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim); + const bool32x4_t mask_even = vec_cmplt(thres_even, lim); + return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge); +} + +void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int row, col; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. 
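// Scalar shape of the filter being vectorized here (a rough sketch, not the
// exact C reference): a running sum and sum of squares are kept over a
// 15-sample window around s[c], and a pixel is only rewritten when the local
// variance test passes:
//   if (sumsq * 15 - sum * sum < flimit)
//     s[c] = (8 + sum + s[c]) >> 4;
// The left border is handled by replicating the first column, which is why
// the per-row initialization below seeds the window with src[0] * 9 before
// adding the next six columns.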
+ assert(cols % 8 == 0); + + for (row = 0; row < rows; row++) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + // Fill left context with first col. + int16x8_t left_ctx = vec_splats((int16_t)src[0]); + int16_t s = src[0] * 9; + int32_t ssq = src[0] * src[0] * 9 + 16; + + // Fill the next 6 columns of the sliding window with cols 2 to 7. + for (col = 1; col <= 6; ++col) { + s += src[col]; + ssq += src[col] * src[col]; + } + // Set this sum to every element in the window. + sum = vec_splats(s); + sumsq_even = vec_splats(ssq); + sumsq_odd = vec_splats(ssq); + + for (col = 0; col < cols; col += 8) { + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const uint8x16_t val = vec_vsx_ld(0, src + col); + const int16x8_t val_high = unpack_to_s16_h(val); + + // C: s[c + 7] + const int16x8_t right_ctx = (col + 8 == cols) + ? vec_splats((int16_t)src[col + 7]) + : next7l_s16(val); + + // C: x = s[c + 7] - s[c - 8]; + const int16x8_t x = vec_sub(right_ctx, left_ctx); + const int32x4_t xsq_even = + vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx)); + const int32x4_t xsq_odd = + vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx)); + + const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_even = vec_add(sumsq_even, sumsq_tmp); + // B D F G + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd)); + + sum = vec_add(sum, slide_sum_s16(x)); + + // C: (8 + sum + s[c]) >> 4 + filtered = filter_s16(vec_splats((int16_t)8), sum, val_high); + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(val_high, filtered, mask); + + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge); + vec_vsx_st(out, 0, src + col); + + // Update window sum and square sum + sum = vec_splat(sum, 7); + sumsq_even = vec_splat(sumsq_odd, 3); + sumsq_odd = vec_splat(sumsq_odd, 3); + + // C: s[c - 8] (for next iteration) + left_ctx = val_high; + } + src += pitch; + } +} + +void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int col, row, i; + int16x8_t window[16]; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + // If rows is less than 8 the bottom border extension fails. + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t r1, sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); + // Fill sliding window with first row. + for (i = 0; i <= 8; i++) { + window[i] = r1; + } + // First 9 rows of the sliding window are the same. + // sum = r1 * 9 + sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16); + + // sumsq = r1 * r1 * 9 + sumsq_even = vec_mule(sum, r1); + sumsq_odd = vec_mulo(sum, r1); + + // Fill the next 6 rows of the sliding window with rows 2 to 7. 
+ for (i = 1; i <= 6; ++i) { + const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst)); + window[i + 8] = next_row; + sum = vec_add(sum, next_row); + sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row)); + sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row)); + } + + for (row = 0; row < rows; row++) { + int32x4_t d15_even, d15_odd, d0_even, d0_odd; + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127)); + + // Move the sliding window + if (row + 7 < rows) { + window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst)); + } else { + window[15] = window[14]; + } + + // C: sum += s[7 * pitch] - s[-8 * pitch]; + sum = vec_add(sum, vec_sub(window[15], window[0])); + + // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * + // pitch]; + // Optimization Note: Caching a squared-window for odd and even is + // slower than just repeating the multiplies. + d15_odd = vec_mulo(window[15], window[15]); + d15_even = vec_mule(window[15], window[15]); + d0_odd = vec_mulo(window[0], window[0]); + d0_even = vec_mule(window[0], window[0]); + sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd)); + sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even)); + + // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4 + filtered = filter_s16(rv, sum, window[8]); + + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(window[8], filtered, mask); + + // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per + // iteration + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch), + load_merge); + vec_vsx_st(out, 0, dst + row * pitch); + + // Optimization Note: Turns out that the following loop is faster than + // using pointers to manage the sliding window. 
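// (For comparison, a circular-buffer variant would avoid the copies, e.g. a
//  hypothetical sketch:
//    start = (start + 1) & 15;                 // drop the oldest row
//    window[(start + 15) & 15] = newest_row;   // append the newest row
//  at the cost of masking every window[] access; per the note above, the
//  plain shift measured faster here.)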
+ for (i = 1; i < 16; i++) { + window[i - 1] = window[i]; + } + } + dst += 8; + } +} diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index f095cb0a4..6603b85ac 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -76,6 +76,8 @@ static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; +static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ const uint32x4_t shift14 = vec_splat_u32(14); @@ -107,6 +109,15 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; @@ -114,13 +125,10 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; int16x8_t v0 = load_tran_low(0, input); int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); int16x8_t t0 = vec_mergeh(v0, v1); int16x8_t t1 = vec_mergel(v0, v1); - uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -130,6 +138,7 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; uint8_t tmp_dest[16]; ROUND_SHIFT_INIT @@ -148,13 +157,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD4(v0, t_out0); PIXEL_ADD4(v1, t_out1); - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); - tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); - output_v = vec_packsu(tmp16_0, tmp16_1); - vec_vsx_st(output_v, 0, tmp_dest); - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + PACK_STORE(v0, v1); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -1062,3 +1066,67 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, ADD_STORE_BLOCK(src2, 16); ADD_STORE_BLOCK(src3, 24); } + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + 
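/* TRANSFORM_COLS above mirrors the scalar 1-D inverse Walsh-Hadamard
 * butterfly used by the generic C iwht4x4 (sketched here; see
 * vpx_dsp/inv_txfm.c for the reference):
 *
 *   a += c;
 *   d -= b;
 *   e  = (a - d) >> 1;
 *   b  = e - b;
 *   c  = e - c;
 *   a -= b;
 *   d += c;
 *
 * The function below first scales the input down (>> 2), runs this butterfly
 * once per direction with TRANSPOSE_WHT in between, and then packs and adds
 * the result to dest via PACK_STORE. */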
+void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index e037f89e3..3a9092f64 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -20,31 +20,70 @@ static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { return vec_xor(vec_add(a, mask), mask); } +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + // Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit // integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right // shift. - return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_s16); + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); } +// Quantization function used for 4x4, 8x8 and 16x16 blocks. static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs, int16x8_t round, int16x8_t quant, int16x8_t quant_shift, bool16x8_t mask) { - int16x8_t rounded, qcoeff; - rounded = vec_vaddshs(coeff_abs, round); - qcoeff = vec_mulhi(rounded, quant); + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); qcoeff = vec_add(qcoeff, rounded); qcoeff = vec_mulhi(qcoeff, quant_shift); qcoeff = vec_sign(qcoeff, coeff); return vec_and(qcoeff, mask); } +// Quantization function used for 32x32 blocks. +static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, + bool16x8_t mask) { + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + // 32x32 blocks require an extra multiplication by 2, this compensates for the + // extra right shift added in vec_mulhi, as such vec_madds can be used + // directly instead of vec_mulhi (((a * b) >> 15) >> 1) << 1 == (a * b >> 15) + qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +// DeQuantization function used for 32x32 blocks. 
Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int16x8_t dqcoeff; + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + dqcoeff = vec_pack(dqcoeffe, dqcoeffo); + return vec_perm(dqcoeff, dqcoeff, vec_perm_merge); +} + static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, - const int16_t *iscan_ptr) { - bool16x8_t zero_coeff; - int16x8_t scan = vec_vsx_ld(0, iscan_ptr); - zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); + const int16_t *iscan_ptr, int index) { + int16x8_t scan = vec_vsx_ld(index, iscan_ptr); + bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -64,7 +103,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - int16x8_t qcoeff, dqcoeff, eob; + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; // First set of 8 coeff starts with DC + 7 AC int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); @@ -73,51 +113,194 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); - int16x8_t coeff = vec_vsx_ld(0, coeff_ptr); - int16x8_t coeff_abs = vec_abs(coeff); - bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); (void)scan_ptr; (void)skip_block; assert(!skip_block); - qcoeff = - quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); - vec_vsx_st(qcoeff, 0, qcoeff_ptr); - - dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); - vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); - - eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr); - - // All other sets of 8 coeffs will only contain AC - zbin = vec_splat(zbin, 1); + qcoeff0 = + quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); round = vec_splat(round, 1); quant = vec_splat(quant, 1); - dequant = vec_splat(dequant, 1); quant_shift = vec_splat(quant_shift, 1); + qcoeff1 = + quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + + if (n_coeffs > 16) { + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2; + 
bool16x8_t zero_mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = + vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} + +void vpx_quantize_b_32x32_vsx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + (void)scan_ptr; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + // 32x32 quantization requires that zbin and round be divided by 2 + zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); // remove DC from zbin + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + round = vec_splat(round, 1); // remove DC from round + quant = vec_splat(quant, 1); // remove DC from quant + quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr); + dequant = 
vec_splat(dequant, 1); // remove DC from dequant + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); - n_coeffs -= 8; do { - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; - iscan_ptr += 8; + int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; + bool16x8_t zero_mask2; + + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); - coeff = vec_vsx_ld(0, coeff_ptr); - coeff_abs = vec_abs(coeff); - zero_mask = vec_cmpge(coeff_abs, zbin); - qcoeff = - quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); - vec_vsx_st(qcoeff, 0, qcoeff_ptr); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); - dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); - vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); - n_coeffs -= 8; - } while (n_coeffs > 0); + // 24 int16_t is 48 bytes + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); eob = vec_max_across(eob); *eob_ptr = eob[0]; diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h index e2af55463..c6c7ce9f1 100644 --- a/vpx_dsp/ppc/types_vsx.h +++ b/vpx_dsp/ppc/types_vsx.h @@ -19,7 +19,9 @@ typedef vector signed short int16x8_t; typedef vector unsigned short uint16x8_t; typedef vector signed int int32x4_t; typedef vector unsigned int uint32x4_t; +typedef vector bool char bool8x16_t; typedef vector bool short bool16x8_t; +typedef vector bool int bool32x4_t; #ifdef __clang__ static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, @@ -66,9 +68,15 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, #endif #endif +static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; -static const uint16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 }; +static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 }; static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 }; static const uint8x16_t vec_perm64 = { 0x08, 0x09, 
0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; @@ -79,4 +87,8 @@ static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, 0x0E, 0x0F, 0x00, 0x01 }; +static const uint8x16_t vec_perm_merge = { 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, + 0x0A, 0x0B, 0x04, 0x05, 0x0C, 0x0D, + 0x06, 0x07, 0x0E, 0x0F }; + #endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index 1efe2f005..d3f257b63 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -10,10 +10,11 @@ #include <assert.h> +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/ppc/types_vsx.h" -static inline uint8x16_t read4x2(const uint8_t *a, int stride) { +static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); @@ -101,3 +102,174 @@ void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, } } } + +static INLINE void variance_inner_32(const uint8_t *a, const uint8_t *b, + int32x4_t *sum_squared, int32x4_t *sum) { + int32x4_t s = *sum; + int32x4_t ss = *sum_squared; + + const uint8x16_t va0 = vec_vsx_ld(0, a); + const uint8x16_t vb0 = vec_vsx_ld(0, b); + const uint8x16_t va1 = vec_vsx_ld(16, a); + const uint8x16_t vb1 = vec_vsx_ld(16, b); + + const int16x8_t a0 = unpack_to_s16_h(va0); + const int16x8_t b0 = unpack_to_s16_h(vb0); + const int16x8_t a1 = unpack_to_s16_l(va0); + const int16x8_t b1 = unpack_to_s16_l(vb0); + const int16x8_t a2 = unpack_to_s16_h(va1); + const int16x8_t b2 = unpack_to_s16_h(vb1); + const int16x8_t a3 = unpack_to_s16_l(va1); + const int16x8_t b3 = unpack_to_s16_l(vb1); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int16x8_t d2 = vec_sub(a2, b2); + const int16x8_t d3 = vec_sub(a3, b3); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + s = vec_sum4s(d2, s); + ss = vec_msum(d2, d2, ss); + s = vec_sum4s(d3, s); + ss = vec_msum(d3, d3, ss); + *sum = s; + *sum_squared = ss; +} + +static INLINE void variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { + int i; + + int32x4_t s = vec_splat_s32(0); + int32x4_t ss = vec_splat_s32(0); + + switch (w) { + case 4: + for (i = 0; i < h / 2; ++i) { + const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); + const int16x8_t d = vec_sub(a0, b0); + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + a += a_stride * 2; + b += b_stride * 2; + } + break; + case 8: + for (i = 0; i < h; ++i) { + const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, a)); + const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, b)); + const int16x8_t d = vec_sub(a0, b0); + + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + a += a_stride; + b += b_stride; + } + break; + case 16: + for (i = 0; i < h; ++i) { + const uint8x16_t va = vec_vsx_ld(0, a); + const uint8x16_t vb = vec_vsx_ld(0, b); + const int16x8_t a0 = unpack_to_s16_h(va); + const int16x8_t b0 = unpack_to_s16_h(vb); + const int16x8_t a1 = unpack_to_s16_l(va); + const int16x8_t b1 = unpack_to_s16_l(vb); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + + a += a_stride; + b += 
b_stride; + } + break; + case 32: + for (i = 0; i < h; ++i) { + variance_inner_32(a, b, &ss, &s); + a += a_stride; + b += b_stride; + } + break; + case 64: + for (i = 0; i < h; ++i) { + variance_inner_32(a, b, &ss, &s); + variance_inner_32(a + 32, b + 32, &ss, &s); + + a += a_stride; + b += b_stride; + } + break; + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste(s, 0, sum); + + ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)ss, 0, sse); +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse, int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define VARIANCES(W, H) VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index cb06a476f..573d6fef1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -69,6 +69,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm +DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 93ecd7c19..9661f3bd8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -626,7 +626,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; - specialize qw/vpx_iwht4x4_16_add sse2/; + specialize qw/vpx_iwht4x4_16_add sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
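For reference, every VAR(W, H) wrapper defined above boils down to the usual
sum / sum-of-squares identity; a scalar sketch of the same computation
(illustrative only, the helper name is made up):

    #include <stdint.h>

    /* variance = SSE - SUM^2 / (W * H), with SUM and SSE accumulated over
     * the per-pixel differences src[i] - ref[i]. */
    static uint32_t variance_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse) {
      int64_t sum = 0;
      uint64_t sse64 = 0;
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int d = src[c] - ref[c];
          sum += d;
          sse64 += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (uint32_t)sse64;
      return *sse - (uint32_t)((sum * sum) / (w * h));
    }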
@@ -702,7 +702,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; @@ -1082,64 +1082,64 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq " # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi/; + specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x8 sse2 neon msa mmi/;
+ specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x4 sse2 neon msa mmi/;
+ specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x8 sse2 neon msa mmi/;
+ specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x4 sse2 neon msa mmi/;
+ specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/;
 #
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+ specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/;
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_get8x8var sse2 neon msa/;
+ specialize qw/vpx_get8x8var sse2 neon msa vsx/;
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
+ specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x8 sse2 avx2 msa mmi/;
+ specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x16 sse2 msa mmi/;
+ specialize qw/vpx_mse8x16 sse2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x8 sse2 msa mmi/;
+ specialize qw/vpx_mse8x8 sse2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
 specialize qw/vpx_get_mb_ss sse2 msa vsx/;
@@ -1598,13 +1598,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
 specialize qw/vpx_plane_add_noise sse2 msa/;
 add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vpx_mbpost_proc_down sse2 neon msa/;
+ specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
 add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
+ specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
 add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
- specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
+ specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/;
 }
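Note on the specialize changes above: adding vsx to a specialize line is what exposes the new _vsx functions to libvpx's run-time CPU dispatch; the actual dispatch code is generated into a vpx_dsp_rtcd.h header at build time from this .pl file. The following is only a rough, hand-written sketch of that pattern, not code from this change; the typedef, HAS_VSX flag, and ppc_simd_caps() query are stand-ins for the generated and ports-layer code.

#include <stdint.h>

/* Function-pointer type matching the vpx_variance prototypes above. */
typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

/* Plain C reference and VSX-specialized implementations (declared only). */
unsigned int vpx_variance16x16_c(const uint8_t *, int, const uint8_t *, int,
                                 unsigned int *);
unsigned int vpx_variance16x16_vsx(const uint8_t *, int, const uint8_t *, int,
                                   unsigned int *);

/* Stand-ins for the PowerPC capability check in the ports layer. */
#define HAS_VSX 0x01
int ppc_simd_caps(void);

/* Callers always go through this pointer. */
vpx_variance_fn_t vpx_variance16x16 = vpx_variance16x16_c;

void setup_rtcd_internal(void) {
  const int flags = ppc_simd_caps();
  vpx_variance16x16 = vpx_variance16x16_c;
  if (flags & HAS_VSX) vpx_variance16x16 = vpx_variance16x16_vsx;
}

In other words, callers keep invoking vpx_variance16x16() and transparently get the VSX variant on POWER hardware that reports VSX support, with the C version as the fallback.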
diff --git a/vpx_ports/config.h b/vpx_ports/config.h
deleted file mode 100644
index 3c1ab99f4..000000000
--- a/vpx_ports/config.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_PORTS_CONFIG_H_
-#define VPX_PORTS_CONFIG_H_
-
-#include "vpx_config.h"
-
-#endif // VPX_PORTS_CONFIG_H_
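Closing reference for the new vpx_dsp/ppc/variance_vsx.c above: the VAR, GET_VAR, and MSE macros all wrap a helper that accumulates the sum of pixel differences and the sum of squared differences, and the block variance is then sse - sum^2 / (w*h), exactly as in the return statement of the VAR macro. A scalar sketch of that computation is shown below; it is illustrative only, and the function names here are hypothetical rather than part of the change.

#include <stdint.h>

/* Scalar reference: *sse accumulates sum((a-b)^2) and *sum accumulates
 * sum(a-b) over a w x h block; this is what the VSX variance() helper
 * computes with vector loads and vec_msum/vec_sums reductions. */
static void variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                         int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += (uint32_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
}

/* Mirrors what VAR(8, 8) expands to: variance = sse - sum^2 / (w*h). */
static uint32_t variance8x8_ref(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                uint32_t *sse) {
  int sum;
  variance_ref(a, a_stride, b, b_stride, 8, 8, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * 8));
}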