14 files changed, 317 insertions, 82 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c527cd527..099793810 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -606,6 +606,13 @@ setup_gnu_toolchain() {
   EXE_SFX=
 }
 
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {
+  xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+    xcodebuild -sdk $1 -version Path 2>/dev/null
+}
+
 process_common_toolchain() {
   if [ -z "$toolchain" ]; then
     gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -729,31 +736,17 @@ process_common_toolchain() {
   IOS_VERSION_MIN="6.0"
 
   # Handle darwin variants. Newer SDKs allow targeting older
-  # platforms, so find the newest SDK available.
+  # platforms, so use the newest one available.
   case ${toolchain} in
     *-darwin*)
-      if [ -z "${DEVELOPER_DIR}" ]; then
-        DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
-        [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
-      fi
-      if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
-        OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
-        OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
-        OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
-        for v in ${OSX_SDK_VERSIONS}; do
-          if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
-            osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
-          fi
-        done
+      osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+      if [ -d "${osx_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${osx_sdk_dir}"
+        add_ldflags "-isysroot ${osx_sdk_dir}"
       fi
       ;;
   esac
 
-  if [ -d "${osx_sdk_dir}" ]; then
-    add_cflags  "-isysroot ${osx_sdk_dir}"
-    add_ldflags "-isysroot ${osx_sdk_dir}"
-  fi
-
   case ${toolchain} in
     *-darwin8-*)
       add_cflags  "-mmacosx-version-min=10.4"
@@ -786,9 +779,11 @@ process_common_toolchain() {
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
       add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
-      osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
-      add_cflags  "-isysroot ${osx_sdk_dir}"
-      add_ldflags "-isysroot ${osx_sdk_dir}"
+      iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+      if [ -d "${iossim_sdk_dir}" ]; then
+        add_cflags  "-isysroot ${iossim_sdk_dir}"
+        add_ldflags "-isysroot ${iossim_sdk_dir}"
+      fi
       ;;
   esac
 
@@ -960,7 +955,7 @@ EOF
           ;;
 
         darwin*)
-          XCRUN_FIND="xcrun --sdk iphoneos -find"
+          XCRUN_FIND="xcrun --sdk iphoneos --find"
           CXX="$(${XCRUN_FIND} clang++)"
           CC="$(${XCRUN_FIND} clang)"
           AR="$(${XCRUN_FIND} ar)"
@@ -987,10 +982,14 @@ EOF
           # options that were put in above
           ASFLAGS="-arch ${tgt_isa} -g"
 
-          alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
-          add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
+          add_cflags -arch ${tgt_isa}
           add_ldflags -arch ${tgt_isa}
 
+          alt_libc="$(show_darwin_sdk_path iphoneos)"
+          if [ -d "${alt_libc}" ]; then
+            add_cflags -isysroot ${alt_libc}
+          fi
+
           if [ "${LD}" = "${CXX}" ]; then
             add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
           else
diff --git a/test/sad_test.cc b/test/sad_test.cc
index eef8c750a..65e9561a9 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1234,14 +1234,24 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
 
-#if HAVE_AVX2
 #if CONFIG_VP9_ENCODER
+#if HAVE_AVX2
 const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
 const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
 INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
                         make_tuple(32, 32, sad_32x32x4d_avx2, -1),
                         make_tuple(64, 64, sad_64x64x4d_avx2, -1)));
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_AVX2
 
+#if HAVE_NEON
+const SadMxNx4Func sad_16x16x4d_neon = vp9_sad16x16x4d_neon;
+const SadMxNx4Func sad_32x32x4d_neon = vp9_sad32x32x4d_neon;
+const SadMxNx4Func sad_64x64x4d_neon = vp9_sad64x64x4d_neon;
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16x4d_neon, -1),
+                        make_tuple(32, 32, sad_32x32x4d_neon, -1),
+                        make_tuple(64, 64, sad_64x64x4d_neon, -1)));
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP9_ENCODER
+
 }  // namespace
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 41b30663a..258fa114f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1760,8 +1760,16 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
         reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
     }
 
+    if (!cpi->initial_width)
+    {
+        cpi->initial_width = cpi->oxcf.Width;
+        cpi->initial_height = cpi->oxcf.Height;
+    }
+
     cm->Width       = cpi->oxcf.Width;
     cm->Height      = cpi->oxcf.Height;
+    assert(cm->Width <= cpi->initial_width);
+    assert(cm->Height <= cpi->initial_height);
 
     /* TODO(jkoleszar): if an internal spatial resampling is active,
      * and we downsize the input image, maybe we should clear the
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b1a749c1d..82d745390 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -665,6 +665,9 @@ typedef struct VP8_COMP
 
     int droppable;
 
+    int initial_width;
+    int initial_height;
+
 #if CONFIG_TEMPORAL_DENOISING
     VP8_DENOISER denoiser;
 #endif
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index f81f07821..96b4cb5f2 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -447,9 +447,14 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
 {
     vpx_codec_err_t res;
 
-    if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
-        && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
-        ERROR("Cannot change width or height after initialization");
+    if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+    {
+        if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+            ERROR("Cannot change width or height after initialization");
+        if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+            (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+            ERROR("Cannot increast width or height larger than their initial values");
+    }
 
     /* Prevent increasing lag_in_frames. This check is stricter than it needs
      * to be -- the limit is not increasing past the first lag_in_frames
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 12f076fed..a1b15e8c3 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1049,7 +1049,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const
 specialize qw/vp9_sad4x4x8 sse4/;
 
 add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
+specialize qw/vp9_sad64x64x4d sse2 avx2 neon/;
 
 add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad32x64x4d sse2/;
@@ -1064,10 +1064,10 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, co
 specialize qw/vp9_sad16x32x4d sse2/;
 
 add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
+specialize qw/vp9_sad32x32x4d sse2 avx2 neon/;
 
 add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x4d sse2/;
+specialize qw/vp9_sad16x16x4d sse2 neon/;
 
 add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad16x8x4d sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_sad4d_neon.c b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
new file mode 100644
index 000000000..cec1689f1
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+                                                    const uint16x8_t vec_hi) {
+  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+                                        vget_high_u16(vec_lo));
+  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+                                        vget_high_u16(vec_hi));
+  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8x16_t vec_src_32,
+                        const uint8x16_t vec_src_48,
+                        const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+                             vget_low_u8(vec_ref_32));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+                             vget_high_u8(vec_ref_32));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+                             vget_low_u8(vec_ref_48));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+                             vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+                        const uint8x16_t vec_src_16,
+                        const uint8_t *ref,
+                        uint16x8_t *vec_sum_ref_lo,
+                        uint16x8_t *vec_sum_ref_hi) {
+  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+                             vget_low_u8(vec_ref_00));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+                             vget_high_u8(vec_ref_00));
+  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+                             vget_low_u8(vec_ref_16));
+  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+                             vget_high_u8(vec_ref_16));
+}
+
+void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t* const ref[4], int ref_stride,
+                          unsigned int *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 64; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t* const ref[4], int ref_stride,
+                          unsigned int *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 32; ++i) {
+    const uint8x16_t vec_src_00 = vld1q_u8(src);
+    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+    sad_neon_32(vec_src_00, vec_src_16, ref0,
+                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref1,
+                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref2,
+                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+    sad_neon_32(vec_src_00, vec_src_16, ref3,
+                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+                          const uint8_t* const ref[4], int ref_stride,
+                          unsigned int *res) {
+  int i;
+  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  for (i = 0; i < 16; ++i) {
+    const uint8x16_t vec_src = vld1q_u8(src);
+    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref0));
+    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref0));
+    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref1));
+    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref1));
+    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref2));
+    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref2));
+    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+                               vget_low_u8(vec_ref3));
+    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+                               vget_high_u8(vec_ref3));
+
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+
+  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3f4ed94d6..a72856db4 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -611,12 +611,10 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
     case ONE_LOOP_REDUCED: {
       int updates = 0;
       int noupdates_before_first = 0;
-
       if (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8) {
         vp9_write_bit(bc, 0);
         return;
       }
-
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -668,7 +666,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
       }
       return;
     }
-
     default:
       assert(0);
   }
@@ -678,16 +675,14 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
-  vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
-  vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-
-  for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
-    build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
-                            frame_coef_probs[tx_size]);
-
-  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-    update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
-                             frame_coef_probs[tx_size]);
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+    vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+    vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+    build_tree_distribution(cpi, tx_size, frame_branch_ct,
+                            frame_coef_probs);
+    update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+                             frame_coef_probs);
+  }
 }
 
 static void encode_loopfilter(struct loopfilter *lf,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 7d4e26aaf..4f245e249 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -403,10 +403,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
   ctx->zeromv_sse = UINT_MAX;
-  // This should be initialized as zero since mode search stage might skip
-  // NEWMV mode if inferred motion vector modes provide sufficiently good
-  // prediction quality.
-  ctx->newmv_sse = 0;
+  ctx->newmv_sse = UINT_MAX;
 }
 
 void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
@@ -418,7 +415,7 @@ void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
     ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
   }
 
-  if (mode == NEWMV) {
+  if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
     ctx->newmv_sse = sse;
     ctx->best_sse_inter_mode = mode;
     ctx->best_sse_mv = mbmi->mv[0];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 89714ac3a..730a229ca 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -490,18 +490,23 @@ static void choose_partitioning(VP9_COMP *cpi,
   int pixels_wide = 64, pixels_high = 64;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
   const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
-
   // Always use 4x4 partition for key frame.
   int use_4x4_partition = (cm->frame_type == KEY_FRAME);
-
   int variance4x4downsample[16];
   int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0;
   const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
-  int64_t threshold_base = (int64_t)(threshold_multiplier *
+  int64_t threshold_base;
+  int64_t threshold;
+  int64_t threshold_bsize_min;
+  int64_t threshold_bsize_max;
+
+  vp9_clear_system_state();
+  threshold_base = (int64_t)(threshold_multiplier *
       vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
-  int64_t threshold = threshold_base;
-  int64_t threshold_bsize_min = threshold_base << 6;
-  int64_t threshold_bsize_max = threshold_base;
+  threshold = threshold_base;
+  threshold_bsize_min = threshold_base << 6;
+  threshold_bsize_max = threshold_base;
+
   // Modify thresholds for key frame and for low-resolutions (set lower
   // thresholds to favor split).
   if (cm->frame_type == KEY_FRAME) {
@@ -512,7 +517,6 @@ static void choose_partitioning(VP9_COMP *cpi,
     threshold_bsize_max = threshold_base >> 2;
   }
 
-  vp9_clear_system_state();
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
   if (xd->mb_to_right_edge < 0)
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index c85bf2a0e..e93842726 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2955,18 +2955,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
 
           if (undershoot_seen || loop_count > 1) {
             // Update rate_correction_factor unless
-            vp9_rc_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi);
 
             q = (q_high + q_low + 1) / 2;
           } else {
             // Update rate_correction_factor unless
-            vp9_rc_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi);
 
             q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, MAX(q_high, top_index));
 
             while (q < q_low && retries < 10) {
-              vp9_rc_update_rate_correction_factors(cpi, 0);
+              vp9_rc_update_rate_correction_factors(cpi);
               q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, MAX(q_high, top_index));
               retries++;
@@ -2979,10 +2979,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
           q_high = q > q_low ? q - 1 : q_low;
 
           if (overshoot_seen || loop_count > 1) {
-            vp9_rc_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi);
             q = (q_high + q_low) / 2;
           } else {
-            vp9_rc_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi);
             q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                    bottom_index, top_index);
             // Special case reset for qlow for constrained quality.
@@ -2995,7 +2995,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
             }
 
             while (q > q_high && retries < 10) {
-              vp9_rc_update_rate_correction_factors(cpi, 0);
+              vp9_rc_update_rate_correction_factors(cpi);
               q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
                                      bottom_index, top_index);
               retries++;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 21f4cce03..11da367b9 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -396,7 +396,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
   }
 }
 
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
   const VP9_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
   double rate_correction_factor = get_rate_correction_factor(cpi);
@@ -431,19 +431,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
 
   // More heavily damped adjustment used if we have been oscillating either side
   // of target.
-  switch (damp_var) {
-    case 0:
-      adjustment_limit = 0.75;
-      break;
-    case 1:
-      adjustment_limit = 0.25 +
-          0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
-      break;
-    case 2:
-    default:
-      adjustment_limit = 0.25;
-      break;
-  }
+  adjustment_limit = 0.25 +
+      0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
 
   cpi->rc.q_2_frame = cpi->rc.q_1_frame;
   cpi->rc.q_1_frame = cm->base_qindex;
@@ -1222,9 +1211,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  vp9_rc_update_rate_correction_factors(
-      cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) ? 2 :
-            ((oxcf->rc_mode == VPX_CBR) ? 1 : 0));
+  vp9_rc_update_rate_correction_factors(cpi);
 
   // Keep a record of last Q and ambient average Q.
   if (cm->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index a53f4e0a2..9774127a9 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -160,7 +160,7 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
 
 // Updates rate correction factors
 // Changes only the rate correction factors in the rate control structure.
-void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
 
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 33a1e6735..fbdd4bad5 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,6 +152,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c