diff options
-rw-r--r-- | build/make/configure.sh | 49 | ||||
-rw-r--r-- | test/sad_test.cc | 14 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 8 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 3 | ||||
-rw-r--r-- | vp8/vp8_cx_iface.c | 11 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 6 | ||||
-rw-r--r-- | vp9/encoder/arm/neon/vp9_sad4d_neon.c | 226 | ||||
-rw-r--r-- | vp9/encoder/vp9_bitstream.c | 21 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.c | 7 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 18 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.c | 21 | ||||
-rw-r--r-- | vp9/encoder/vp9_ratectrl.h | 2 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 1 |
14 files changed, 317 insertions, 82 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh index c527cd527..099793810 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -606,6 +606,13 @@ setup_gnu_toolchain() { EXE_SFX= } +# Reliably find the newest available Darwin SDKs. (Older versions of +# xcrun don't support --show-sdk-path.) +show_darwin_sdk_path() { + xcrun --sdk $1 --show-sdk-path 2>/dev/null || + xcodebuild -sdk $1 -version Path 2>/dev/null +} + process_common_toolchain() { if [ -z "$toolchain" ]; then gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}" @@ -729,31 +736,17 @@ process_common_toolchain() { IOS_VERSION_MIN="6.0" # Handle darwin variants. Newer SDKs allow targeting older - # platforms, so find the newest SDK available. + # platforms, so use the newest one available. case ${toolchain} in *-darwin*) - if [ -z "${DEVELOPER_DIR}" ]; then - DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null` - [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1 - fi - if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then - OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs" - OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk" - OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk" - for v in ${OSX_SDK_VERSIONS}; do - if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then - osx_sdk_dir="${OSX_SDK_ROOTS}/${v}" - fi - done + osx_sdk_dir="$(show_darwin_sdk_path macosx)" + if [ -d "${osx_sdk_dir}" ]; then + add_cflags "-isysroot ${osx_sdk_dir}" + add_ldflags "-isysroot ${osx_sdk_dir}" fi ;; esac - if [ -d "${osx_sdk_dir}" ]; then - add_cflags "-isysroot ${osx_sdk_dir}" - add_ldflags "-isysroot ${osx_sdk_dir}" - fi - case ${toolchain} in *-darwin8-*) add_cflags "-mmacosx-version-min=10.4" @@ -786,9 +779,11 @@ process_common_toolchain() { *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" - osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)" - add_cflags "-isysroot ${osx_sdk_dir}" - add_ldflags "-isysroot ${osx_sdk_dir}" + iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)" + if [ -d "${iossim_sdk_dir}" ]; then + add_cflags "-isysroot ${iossim_sdk_dir}" + add_ldflags "-isysroot ${iossim_sdk_dir}" + fi ;; esac @@ -960,7 +955,7 @@ EOF ;; darwin*) - XCRUN_FIND="xcrun --sdk iphoneos -find" + XCRUN_FIND="xcrun --sdk iphoneos --find" CXX="$(${XCRUN_FIND} clang++)" CC="$(${XCRUN_FIND} clang)" AR="$(${XCRUN_FIND} ar)" @@ -987,10 +982,14 @@ EOF # options that were put in above ASFLAGS="-arch ${tgt_isa} -g" - alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)" - add_cflags -arch ${tgt_isa} -isysroot ${alt_libc} + add_cflags -arch ${tgt_isa} add_ldflags -arch ${tgt_isa} + alt_libc="$(show_darwin_sdk_path iphoneos)" + if [ -d "${alt_libc}" ]; then + add_cflags -isysroot ${alt_libc} + fi + if [ "${LD}" = "${CXX}" ]; then add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}" else diff --git a/test/sad_test.cc b/test/sad_test.cc index eef8c750a..65e9561a9 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1234,14 +1234,24 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( #endif // CONFIG_USE_X86INC #endif // HAVE_SSSE3 -#if HAVE_AVX2 #if CONFIG_VP9_ENCODER +#if HAVE_AVX2 const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values( make_tuple(32, 32, sad_32x32x4d_avx2, -1), make_tuple(64, 64, sad_64x64x4d_avx2, -1))); -#endif // CONFIG_VP9_ENCODER #endif // HAVE_AVX2 +#if HAVE_NEON +const SadMxNx4Func sad_16x16x4d_neon = vp9_sad16x16x4d_neon; +const SadMxNx4Func sad_32x32x4d_neon = vp9_sad32x32x4d_neon; +const SadMxNx4Func sad_64x64x4d_neon = vp9_sad64x64x4d_neon; +INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::Values( + make_tuple(16, 16, sad_16x16x4d_neon, -1), + make_tuple(32, 32, sad_32x32x4d_neon, -1), + make_tuple(64, 64, sad_64x64x4d_neon, -1))); +#endif // HAVE_NEON +#endif // CONFIG_VP9_ENCODER + } // namespace diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 41b30663a..258fa114f 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1760,8 +1760,16 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); } + if (!cpi->initial_width) + { + cpi->initial_width = cpi->oxcf.Width; + cpi->initial_height = cpi->oxcf.Height; + } + cm->Width = cpi->oxcf.Width; cm->Height = cpi->oxcf.Height; + assert(cm->Width <= cpi->initial_width); + assert(cm->Height <= cpi->initial_height); /* TODO(jkoleszar): if an internal spatial resampling is active, * and we downsize the input image, maybe we should clear the diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index b1a749c1d..82d745390 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -665,6 +665,9 @@ typedef struct VP8_COMP int droppable; + int initial_width; + int initial_height; + #if CONFIG_TEMPORAL_DENOISING VP8_DENOISER denoiser; #endif diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index f81f07821..96b4cb5f2 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -447,9 +447,14 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, { vpx_codec_err_t res; - if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h)) - && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)) - ERROR("Cannot change width or height after initialization"); + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) + { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || + (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + ERROR("Cannot increast width or height larger than their initial values"); + } /* Prevent increasing lag_in_frames. This check is stricter than it needs * to be -- the limit is not increasing past the first lag_in_frames diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 12f076fed..a1b15e8c3 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1049,7 +1049,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vp9_sad4x4x8 sse4/; add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x4d sse2 avx2/; +specialize qw/vp9_sad64x64x4d sse2 avx2 neon/; add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad32x64x4d sse2/; @@ -1064,10 +1064,10 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co specialize qw/vp9_sad16x32x4d sse2/; add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x4d sse2 avx2/; +specialize qw/vp9_sad32x32x4d sse2 avx2 neon/; add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad16x16x4d sse2/; +specialize qw/vp9_sad16x16x4d sse2 neon/; add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad16x8x4d sse2/; diff --git a/vp9/encoder/arm/neon/vp9_sad4d_neon.c b/vp9/encoder/arm/neon/vp9_sad4d_neon.c new file mode 100644 index 000000000..cec1689f1 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_sad4d_neon.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), + vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), + vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, +// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo +// and vec_sum_ref_hi. +static void sad_neon_64(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8x16_t vec_src_32, + const uint8x16_t vec_src_48, + const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); +} + +// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, +// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. +static void sad_neon_32(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); +} + +void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t* const ref[4], int ref_stride, + unsigned int *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t* const ref[4], int ref_stride, + unsigned int *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + + sad_neon_32(vec_src_00, vec_src_16, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t* const ref[4], int ref_stride, + unsigned int *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref0 = vld1q_u8(ref0); + const uint8x16_t vec_ref1 = vld1q_u8(ref1); + const uint8x16_t vec_ref2 = vld1q_u8(ref2); + const uint8x16_t vec_ref3 = vld1q_u8(ref3); + + vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref0)); + vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref0)); + vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref1)); + vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref1)); + vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref2)); + vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref2)); + vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref3)); + vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref3)); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 3f4ed94d6..a72856db4 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -611,12 +611,10 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, case ONE_LOOP_REDUCED: { int updates = 0; int noupdates_before_first = 0; - if (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8) { vp9_write_bit(bc, 0); return; } - for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { @@ -668,7 +666,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, } return; } - default: assert(0); } @@ -678,16 +675,14 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) { const TX_MODE tx_mode = cpi->common.tx_mode; const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; TX_SIZE tx_size; - vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES]; - vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; - - for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size) - build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size], - frame_coef_probs[tx_size]); - - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size], - frame_coef_probs[tx_size]); + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) { + vp9_coeff_stats frame_branch_ct[PLANE_TYPES]; + vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES]; + build_tree_distribution(cpi, tx_size, frame_branch_ct, + frame_coef_probs); + update_coef_probs_common(w, cpi, tx_size, frame_branch_ct, + frame_coef_probs); + } } static void encode_loopfilter(struct loopfilter *lf, diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 7d4e26aaf..4f245e249 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -403,10 +403,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { ctx->zeromv_sse = UINT_MAX; - // This should be initialized as zero since mode search stage might skip - // NEWMV mode if inferred motion vector modes provide sufficiently good - // prediction quality. - ctx->newmv_sse = 0; + ctx->newmv_sse = UINT_MAX; } void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, @@ -418,7 +415,7 @@ void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, ctx->best_zeromv_reference_frame = mbmi->ref_frame[0]; } - if (mode == NEWMV) { + if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) { ctx->newmv_sse = sse; ctx->best_sse_inter_mode = mode; ctx->best_sse_mv = mbmi->mv[0]; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 89714ac3a..730a229ca 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -490,18 +490,23 @@ static void choose_partitioning(VP9_COMP *cpi, int pixels_wide = 64, pixels_high = 64; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - // Always use 4x4 partition for key frame. int use_4x4_partition = (cm->frame_type == KEY_FRAME); - int variance4x4downsample[16]; int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0; const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4; - int64_t threshold_base = (int64_t)(threshold_multiplier * + int64_t threshold_base; + int64_t threshold; + int64_t threshold_bsize_min; + int64_t threshold_bsize_max; + + vp9_clear_system_state(); + threshold_base = (int64_t)(threshold_multiplier * vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth)); - int64_t threshold = threshold_base; - int64_t threshold_bsize_min = threshold_base << 6; - int64_t threshold_bsize_max = threshold_base; + threshold = threshold_base; + threshold_bsize_min = threshold_base << 6; + threshold_bsize_max = threshold_base; + // Modify thresholds for key frame and for low-resolutions (set lower // thresholds to favor split). if (cm->frame_type == KEY_FRAME) { @@ -512,7 +517,6 @@ static void choose_partitioning(VP9_COMP *cpi, threshold_bsize_max = threshold_base >> 2; } - vp9_clear_system_state(); set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); if (xd->mb_to_right_edge < 0) diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index c85bf2a0e..e93842726 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2955,18 +2955,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi, if (undershoot_seen || loop_count > 1) { // Update rate_correction_factor unless - vp9_rc_update_rate_correction_factors(cpi, 1); + vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low + 1) / 2; } else { // Update rate_correction_factor unless - vp9_rc_update_rate_correction_factors(cpi, 0); + vp9_rc_update_rate_correction_factors(cpi); q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, MAX(q_high, top_index)); while (q < q_low && retries < 10) { - vp9_rc_update_rate_correction_factors(cpi, 0); + vp9_rc_update_rate_correction_factors(cpi); q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, MAX(q_high, top_index)); retries++; @@ -2979,10 +2979,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, q_high = q > q_low ? q - 1 : q_low; if (overshoot_seen || loop_count > 1) { - vp9_rc_update_rate_correction_factors(cpi, 1); + vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low) / 2; } else { - vp9_rc_update_rate_correction_factors(cpi, 0); + vp9_rc_update_rate_correction_factors(cpi); q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index); // Special case reset for qlow for constrained quality. @@ -2995,7 +2995,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, } while (q > q_high && retries < 10) { - vp9_rc_update_rate_correction_factors(cpi, 0); + vp9_rc_update_rate_correction_factors(cpi); q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, top_index); retries++; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 21f4cce03..11da367b9 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -396,7 +396,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { } } -void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { +void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { const VP9_COMMON *const cm = &cpi->common; int correction_factor = 100; double rate_correction_factor = get_rate_correction_factor(cpi); @@ -431,19 +431,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { // More heavily damped adjustment used if we have been oscillating either side // of target. - switch (damp_var) { - case 0: - adjustment_limit = 0.75; - break; - case 1: - adjustment_limit = 0.25 + - 0.5 * MIN(1, fabs(log10(0.01 * correction_factor))); - break; - case 2: - default: - adjustment_limit = 0.25; - break; - } + adjustment_limit = 0.25 + + 0.5 * MIN(1, fabs(log10(0.01 * correction_factor))); cpi->rc.q_2_frame = cpi->rc.q_1_frame; cpi->rc.q_1_frame = cm->base_qindex; @@ -1222,9 +1211,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->projected_frame_size = (int)(bytes_used << 3); // Post encode loop adjustment of Q prediction. - vp9_rc_update_rate_correction_factors( - cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) ? 2 : - ((oxcf->rc_mode == VPX_CBR) ? 1 : 0)); + vp9_rc_update_rate_correction_factors(cpi); // Keep a record of last Q and ambient average Q. if (cm->frame_type == KEY_FRAME) { diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index a53f4e0a2..9774127a9 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -160,7 +160,7 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); // Updates rate correction factors // Changes only the rate correction factors in the rate control structure. -void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var); +void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi); // Decide if we should drop this frame: For 1-pass CBR. // Changes only the decimation count in the rate control structure diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 33a1e6735..fbdd4bad5 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -152,6 +152,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c |