summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--build/make/configure.sh49
-rw-r--r--test/sad_test.cc14
-rw-r--r--vp8/encoder/onyx_if.c8
-rw-r--r--vp8/encoder/onyx_int.h3
-rw-r--r--vp8/vp8_cx_iface.c11
-rw-r--r--vp9/common/vp9_rtcd_defs.pl6
-rw-r--r--vp9/encoder/arm/neon/vp9_sad4d_neon.c226
-rw-r--r--vp9/encoder/vp9_bitstream.c21
-rw-r--r--vp9/encoder/vp9_denoiser.c7
-rw-r--r--vp9/encoder/vp9_encodeframe.c18
-rw-r--r--vp9/encoder/vp9_encoder.c12
-rw-r--r--vp9/encoder/vp9_ratectrl.c21
-rw-r--r--vp9/encoder/vp9_ratectrl.h2
-rw-r--r--vp9/vp9cx.mk1
14 files changed, 317 insertions, 82 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c527cd527..099793810 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -606,6 +606,13 @@ setup_gnu_toolchain() {
EXE_SFX=
}
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {
+ xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+ xcodebuild -sdk $1 -version Path 2>/dev/null
+}
+
process_common_toolchain() {
if [ -z "$toolchain" ]; then
gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -729,31 +736,17 @@ process_common_toolchain() {
IOS_VERSION_MIN="6.0"
# Handle darwin variants. Newer SDKs allow targeting older
- # platforms, so find the newest SDK available.
+ # platforms, so use the newest one available.
case ${toolchain} in
*-darwin*)
- if [ -z "${DEVELOPER_DIR}" ]; then
- DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
- [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
- fi
- if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
- OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
- OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
- OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
- for v in ${OSX_SDK_VERSIONS}; do
- if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
- osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
- fi
- done
+ osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+ if [ -d "${osx_sdk_dir}" ]; then
+ add_cflags "-isysroot ${osx_sdk_dir}"
+ add_ldflags "-isysroot ${osx_sdk_dir}"
fi
;;
esac
- if [ -d "${osx_sdk_dir}" ]; then
- add_cflags "-isysroot ${osx_sdk_dir}"
- add_ldflags "-isysroot ${osx_sdk_dir}"
- fi
-
case ${toolchain} in
*-darwin8-*)
add_cflags "-mmacosx-version-min=10.4"
@@ -786,9 +779,11 @@ process_common_toolchain() {
*-iphonesimulator-*)
add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
- osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
- add_cflags "-isysroot ${osx_sdk_dir}"
- add_ldflags "-isysroot ${osx_sdk_dir}"
+ iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+ if [ -d "${iossim_sdk_dir}" ]; then
+ add_cflags "-isysroot ${iossim_sdk_dir}"
+ add_ldflags "-isysroot ${iossim_sdk_dir}"
+ fi
;;
esac
@@ -960,7 +955,7 @@ EOF
;;
darwin*)
- XCRUN_FIND="xcrun --sdk iphoneos -find"
+ XCRUN_FIND="xcrun --sdk iphoneos --find"
CXX="$(${XCRUN_FIND} clang++)"
CC="$(${XCRUN_FIND} clang)"
AR="$(${XCRUN_FIND} ar)"
@@ -987,10 +982,14 @@ EOF
# options that were put in above
ASFLAGS="-arch ${tgt_isa} -g"
- alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
- add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
+ add_cflags -arch ${tgt_isa}
add_ldflags -arch ${tgt_isa}
+ alt_libc="$(show_darwin_sdk_path iphoneos)"
+ if [ -d "${alt_libc}" ]; then
+ add_cflags -isysroot ${alt_libc}
+ fi
+
if [ "${LD}" = "${CXX}" ]; then
add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
else
diff --git a/test/sad_test.cc b/test/sad_test.cc
index eef8c750a..65e9561a9 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1234,14 +1234,24 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
-#if HAVE_AVX2
#if CONFIG_VP9_ENCODER
+#if HAVE_AVX2
const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
make_tuple(32, 32, sad_32x32x4d_avx2, -1),
make_tuple(64, 64, sad_64x64x4d_avx2, -1)));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_AVX2
+#if HAVE_NEON
+const SadMxNx4Func sad_16x16x4d_neon = vp9_sad16x16x4d_neon;
+const SadMxNx4Func sad_32x32x4d_neon = vp9_sad32x32x4d_neon;
+const SadMxNx4Func sad_64x64x4d_neon = vp9_sad64x64x4d_neon;
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::Values(
+ make_tuple(16, 16, sad_16x16x4d_neon, -1),
+ make_tuple(32, 32, sad_32x32x4d_neon, -1),
+ make_tuple(64, 64, sad_64x64x4d_neon, -1)));
+#endif // HAVE_NEON
+#endif // CONFIG_VP9_ENCODER
+
} // namespace
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 41b30663a..258fa114f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1760,8 +1760,16 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
}
+ if (!cpi->initial_width)
+ {
+ cpi->initial_width = cpi->oxcf.Width;
+ cpi->initial_height = cpi->oxcf.Height;
+ }
+
cm->Width = cpi->oxcf.Width;
cm->Height = cpi->oxcf.Height;
+ assert(cm->Width <= cpi->initial_width);
+ assert(cm->Height <= cpi->initial_height);
/* TODO(jkoleszar): if an internal spatial resampling is active,
* and we downsize the input image, maybe we should clear the
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b1a749c1d..82d745390 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -665,6 +665,9 @@ typedef struct VP8_COMP
int droppable;
+ int initial_width;
+ int initial_height;
+
#if CONFIG_TEMPORAL_DENOISING
VP8_DENOISER denoiser;
#endif
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index f81f07821..96b4cb5f2 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -447,9 +447,14 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
{
vpx_codec_err_t res;
- if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
- && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
- ERROR("Cannot change width or height after initialization");
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+ {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ ERROR("Cannot increast width or height larger than their initial values");
+ }
/* Prevent increasing lag_in_frames. This check is stricter than it needs
* to be -- the limit is not increasing past the first lag_in_frames
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 12f076fed..a1b15e8c3 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1049,7 +1049,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const
specialize qw/vp9_sad4x4x8 sse4/;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
+specialize qw/vp9_sad64x64x4d sse2 avx2 neon/;
add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x64x4d sse2/;
@@ -1064,10 +1064,10 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co
specialize qw/vp9_sad16x32x4d sse2/;
add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
+specialize qw/vp9_sad32x32x4d sse2 avx2 neon/;
add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x4d sse2/;
+specialize qw/vp9_sad16x16x4d sse2 neon/;
add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x8x4d sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_sad4d_neon.c b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
new file mode 100644
index 000000000..cec1689f1
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+ vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+ vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8x16_t vec_src_32,
+ const uint8x16_t vec_src_48,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+}
+
+void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ unsigned int *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ unsigned int *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+ sad_neon_32(vec_src_00, vec_src_16, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ unsigned int *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+ const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+ const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+ const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+ vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref0));
+ vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref0));
+ vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref1));
+ vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref1));
+ vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref2));
+ vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref2));
+ vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref3));
+ vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref3));
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3f4ed94d6..a72856db4 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -611,12 +611,10 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
case ONE_LOOP_REDUCED: {
int updates = 0;
int noupdates_before_first = 0;
-
if (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8) {
vp9_write_bit(bc, 0);
return;
}
-
for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
@@ -668,7 +666,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
}
return;
}
-
default:
assert(0);
}
@@ -678,16 +675,14 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
const TX_MODE tx_mode = cpi->common.tx_mode;
const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
TX_SIZE tx_size;
- vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
- vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-
- for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
- build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
-
- for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+ vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ }
}
static void encode_loopfilter(struct loopfilter *lf,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 7d4e26aaf..4f245e249 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -403,10 +403,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
ctx->zeromv_sse = UINT_MAX;
- // This should be initialized as zero since mode search stage might skip
- // NEWMV mode if inferred motion vector modes provide sufficiently good
- // prediction quality.
- ctx->newmv_sse = 0;
+ ctx->newmv_sse = UINT_MAX;
}
void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
@@ -418,7 +415,7 @@ void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
}
- if (mode == NEWMV) {
+ if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
ctx->newmv_sse = sse;
ctx->best_sse_inter_mode = mode;
ctx->best_sse_mv = mbmi->mv[0];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 89714ac3a..730a229ca 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -490,18 +490,23 @@ static void choose_partitioning(VP9_COMP *cpi,
int pixels_wide = 64, pixels_high = 64;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
-
// Always use 4x4 partition for key frame.
int use_4x4_partition = (cm->frame_type == KEY_FRAME);
-
int variance4x4downsample[16];
int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0;
const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
- int64_t threshold_base = (int64_t)(threshold_multiplier *
+ int64_t threshold_base;
+ int64_t threshold;
+ int64_t threshold_bsize_min;
+ int64_t threshold_bsize_max;
+
+ vp9_clear_system_state();
+ threshold_base = (int64_t)(threshold_multiplier *
vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
- int64_t threshold = threshold_base;
- int64_t threshold_bsize_min = threshold_base << 6;
- int64_t threshold_bsize_max = threshold_base;
+ threshold = threshold_base;
+ threshold_bsize_min = threshold_base << 6;
+ threshold_bsize_max = threshold_base;
+
// Modify thresholds for key frame and for low-resolutions (set lower
// thresholds to favor split).
if (cm->frame_type == KEY_FRAME) {
@@ -512,7 +517,6 @@ static void choose_partitioning(VP9_COMP *cpi,
threshold_bsize_max = threshold_base >> 2;
}
- vp9_clear_system_state();
set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
if (xd->mb_to_right_edge < 0)
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index c85bf2a0e..e93842726 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2955,18 +2955,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
if (undershoot_seen || loop_count > 1) {
// Update rate_correction_factor unless
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low + 1) / 2;
} else {
// Update rate_correction_factor unless
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
while (q < q_low && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
retries++;
@@ -2979,10 +2979,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
q_high = q > q_low ? q - 1 : q_low;
if (overshoot_seen || loop_count > 1) {
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low) / 2;
} else {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
// Special case reset for qlow for constrained quality.
@@ -2995,7 +2995,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
}
while (q > q_high && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
retries++;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 21f4cce03..11da367b9 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -396,7 +396,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
}
}
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
const VP9_COMMON *const cm = &cpi->common;
int correction_factor = 100;
double rate_correction_factor = get_rate_correction_factor(cpi);
@@ -431,19 +431,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// More heavily damped adjustment used if we have been oscillating either side
// of target.
- switch (damp_var) {
- case 0:
- adjustment_limit = 0.75;
- break;
- case 1:
- adjustment_limit = 0.25 +
- 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
- break;
- case 2:
- default:
- adjustment_limit = 0.25;
- break;
- }
+ adjustment_limit = 0.25 +
+ 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
cpi->rc.q_2_frame = cpi->rc.q_1_frame;
cpi->rc.q_1_frame = cm->base_qindex;
@@ -1222,9 +1211,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
rc->projected_frame_size = (int)(bytes_used << 3);
// Post encode loop adjustment of Q prediction.
- vp9_rc_update_rate_correction_factors(
- cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) ? 2 :
- ((oxcf->rc_mode == VPX_CBR) ? 1 : 0));
+ vp9_rc_update_rate_correction_factors(cpi);
// Keep a record of last Q and ambient average Q.
if (cm->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index a53f4e0a2..9774127a9 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -160,7 +160,7 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
// Updates rate correction factors
// Changes only the rate correction factors in the rate control structure.
-void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
// Decide if we should drop this frame: For 1-pass CBR.
// Changes only the decimation count in the rate control structure
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 33a1e6735..fbdd4bad5 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,6 +152,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c