21 files changed, 611 insertions, 85 deletions
diff --git a/build/make/Makefile b/build/make/Makefile
index 9efa0ec02..ed90397f0 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -118,20 +118,26 @@ testdata::
 utiltest:
 
 # Add compiler flags for intrinsic files
+ifeq ($(TOOLCHAIN), x86-os2-gcc)
+STACKREALIGN=-mstackrealign
+else
+STACKREALIGN=
+endif
+
 $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
 $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index d25f31333..6bc8509ab 100755..100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -381,8 +381,8 @@ EOF
 
 # tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
 check_gcc_machine_option() {
-    local opt="$1"
-    local feature="$2"
+    opt="$1"
+    feature="$2"
     [ -n "$feature" ] || feature="$opt"
 
     if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
@@ -419,8 +419,8 @@ true
 }
 
 write_common_target_config_mk() {
-    local CC="${CC}"
-    local CXX="${CXX}"
+    saved_CC="${CC}"
+    saved_CXX="${CXX}"
     enabled ccache && CC="ccache ${CC}"
     enabled ccache && CXX="ccache ${CXX}"
     print_webm_license $1 "##" ""
@@ -470,6 +470,8 @@ EOF
 
     enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
 
+    CC="${saved_CC}"
+    CXX="${saved_CXX}"
 }
 
 
@@ -1314,8 +1316,9 @@ process_toolchain() {
 }
 
 print_config_mk() {
-    local prefix=$1
-    local makefile=$2
+    saved_prefix="${prefix}"
+    prefix=$1
+    makefile=$2
     shift 2
     for cfg; do
         if enabled $cfg; then
@@ -1323,11 +1326,13 @@ print_config_mk() {
             echo "${prefix}_${upname}=yes" >> $makefile
         fi
     done
+    prefix="${saved_prefix}"
 }
 
 print_config_h() {
-    local prefix=$1
-    local header=$2
+    saved_prefix="${prefix}"
+    prefix=$1
+    header=$2
     shift 2
     for cfg; do
         upname="`toupper $cfg`"
@@ -1337,10 +1342,11 @@ print_config_h() {
             echo "#define ${prefix}_${upname} 0" >> $header
         fi
     done
+    prefix="${saved_prefix}"
 }
 
 print_config_vars_h() {
-    local header=$1
+    header=$1
     shift
     while [ $# -gt 0 ]; do
         upname="`toupper $1`"
@@ -1350,9 +1356,10 @@ print_config_vars_h() {
 }
 
 print_webm_license() {
-    local destination=$1
-    local prefix="$2"
-    local suffix="$3"
+    saved_prefix="${prefix}"
+    destination=$1
+    prefix="$2"
+    suffix="$3"
     shift 3
     cat <<EOF > ${destination}
 ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1363,6 +1370,7 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s
 ${prefix} in the file PATENTS.  All contributing project authors may${suffix}
 ${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
 EOF
+    prefix="${saved_prefix}"
 }
 
 process_targets() {
diff --git a/configure b/configure
index d570081d4..d650eeb70 100755
--- a/configure
+++ b/configure
@@ -67,10 +67,10 @@ Codecs:
 EOF
 #restore editor state '
 
-    local family;
-    local last_family;
-    local c;
-    local str;
+    family="";
+    last_family="";
+    c="";
+    str="";
     for c in ${CODECS}; do
         family=${c%_*}
         if [ "${family}" != "${last_family}" ]; then
@@ -412,7 +412,7 @@ process_cmdline() {
 }
 
 post_process_cmdline() {
-    local c
+    c=""
 
     # If the codec family is disabled, disable all components of that family.
     # If the codec family is enabled, enable all components of that family.
@@ -459,8 +459,8 @@ process_targets() {
     enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
 
     # Calculate the default distribution name, based on the enabled features
-    local cf
-    local DIST_DIR=vpx
+    cf=""
+    DIST_DIR=vpx
     for cf in $CODEC_FAMILIES; do
         if enabled ${cf}_encoder && enabled ${cf}_decoder; then
             DIST_DIR="${DIST_DIR}-${cf}"
@@ -482,7 +482,7 @@ process_targets() {
           ;;
     esac
     if [ -f "${source_path}/build/make/version.sh" ]; then
-        local ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
+        ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
         DIST_DIR="${DIST_DIR}-${ver}"
         VERSION_STRING=${ver}
         ver=${ver%%-*}
@@ -516,7 +516,7 @@ EOF
     # Write makefiles for all enabled targets
     #
     for tgt in libs examples docs solution; do
-        local tgt_fn="$tgt-$toolchain.mk"
+        tgt_fn="$tgt-$toolchain.mk"
 
         if enabled $tgt; then
             echo "Creating makefiles for ${toolchain} ${tgt}"
@@ -555,7 +555,7 @@ process_detect() {
                     true;
                 ;;
                 *)
-                    local result=false
+                    result=false
                     for d in "$@"; do
                         [ -f "${d##-I}/$header" ] && result=true && break
                     done
@@ -604,7 +604,7 @@ process_toolchain() {
     # Handle universal binaries for this architecture
     case $toolchain in
         universal-darwin*)
-            local darwin_ver=${tgt_os##darwin}
+            darwin_ver=${tgt_os##darwin}
 
             # Snow Leopard (10.6/darwin10) dropped support for PPC
             # Include PPC support for all prior versions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index a7ad9f0c6..be3e7b2f1 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -579,7 +579,7 @@ int main(int argc, char **argv) {
 
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-     vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
       vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
       vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 6a5d6bb98..567e5f698 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0)));
+        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
 INSTANTIATE_TEST_CASE_P(
     DISABLED_NEON, FwdTrans8x8HT,
     ::testing::Values(
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 9dc7c6a45..83b7435e6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -756,6 +756,18 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
                       make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
 #endif  // HAVE_AVX2
+#if HAVE_NEON
+const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9VarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_neon)));
+
+const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
+    vp9_sub_pixel_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP9SubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon)));
+#endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
 
 }  // namespace vp9
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 7d9441d54..ef7f61b12 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -108,8 +108,8 @@ extern "C"
          * For temporal denoiser: noise_sensitivity = 0 means off,
          * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
          * noise_sensitivity = 2 means temporal denoiser on for all channels.
-         * noise_sensitivity = 3 will be used for aggressive mode in future.
-         * Temporal denoiser is enabled via the build option
+         * noise_sensitivity = 3 means aggressive denoising mode.
+         * Temporal denoiser is enabled via the configuration option:
          * CONFIG_TEMPORAL_DENOISING.
          * For spatial denoiser: noise_sensitivity controls the amount of
          * pre-processing blur: noise_sensitivity = 0 means off.
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 1a401a4b9..c4c0de81b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "denoising.h"
 
 #include "vp8/common/reconinter.h"
@@ -333,12 +335,33 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     return FILTER_BLOCK;
 }
 
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser) {
+  if (!denoiser->aggressive_mode) {  // Normal denoising mode.
+    denoiser->denoise_pars.scale_sse_thresh = 1;
+    denoiser->denoise_pars.scale_motion_thresh = 8;
+    denoiser->denoise_pars.scale_increase_filter = 0;
+    denoiser->denoise_pars.denoise_mv_bias = 95;  // Percent.
+    denoiser->denoise_pars.pickmode_mv_bias = 100;  // Percent.
+    denoiser->denoise_pars.qp_thresh = 0;
+    denoiser->denoise_pars.consec_zerolast = UINT_MAX;
+  } else {  // Aggressive denoising mode.
+    denoiser->denoise_pars.scale_sse_thresh = 2;
+    denoiser->denoise_pars.scale_motion_thresh = 16;
+    denoiser->denoise_pars.scale_increase_filter = 1;
+    denoiser->denoise_pars.denoise_mv_bias = 60;  // Percent.
+    denoiser->denoise_pars.pickmode_mv_bias = 60;  // Percent.
+    denoiser->denoise_pars.qp_thresh = 100;
+    denoiser->denoise_pars.consec_zerolast = 10;
+  }
+}
+
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
-                          int num_mb_rows, int num_mb_cols)
+                          int num_mb_rows, int num_mb_cols, int mode)
 {
     int i;
     assert(denoiser);
     denoiser->num_mb_cols = num_mb_cols;
+    denoiser->aggressive_mode = mode;
 
     for (i = 0; i < MAX_REF_FRAMES; i++)
     {
@@ -369,10 +392,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
 
     denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
     vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
-
+    vp8_denoiser_set_parameters(denoiser);
     return 0;
 }
 
+
 void vp8_denoiser_free(VP8_DENOISER *denoiser)
 {
     int i;
@@ -401,6 +425,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
 {
     int mv_row;
     int mv_col;
+    unsigned int motion_threshold;
     unsigned int motion_magnitude2;
     unsigned int sse_thresh;
     int sse_diff_thresh = 0;
@@ -424,7 +449,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
         int sse_diff = 0;
         // Bias on zero motion vector sse.
-        int zero_bias = 95;
+        const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
         zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
         sse_diff = zero_mv_sse - best_sse;
 
@@ -502,14 +527,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
     mv_row = x->best_sse_mv.as_mv.row;
     mv_col = x->best_sse_mv.as_mv.col;
     motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
-    sse_thresh = SSE_THRESHOLD;
-    if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH;
+    motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+        NOISE_MOTION_THRESHOLD;
 
-    if (best_sse > sse_thresh || motion_magnitude2
-           > 8 * NOISE_MOTION_THRESHOLD)
-    {
-        decision = COPY_BLOCK;
-    }
+    if (motion_magnitude2 <
+        denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+      x->increase_denoising = 1;
+
+    sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+    if (x->increase_denoising)
+      sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+    if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+      decision = COPY_BLOCK;
 
     if (decision == FILTER_BLOCK)
     {
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index a1f195b72..1a42f86d3 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -39,16 +39,40 @@ enum vp8_denoiser_filter_state {
   kFilterNonZeroMV
 };
 
+typedef struct {
+  // Scale factor on sse threshold above which no denoising is done.
+  unsigned int scale_sse_thresh;
+  // Scale factor on motion magnitude threshold above which no
+  // denoising is done.
+  unsigned int scale_motion_thresh;
+  // Scale factor on motion magnitude below which we increase the strength of
+  // the temporal filter (in function vp8_denoiser_filter).
+  unsigned int scale_increase_filter;
+  // Percentage (value / 100) applied to the ZEROMV sse when denoising.
+  unsigned int denoise_mv_bias;
+  // Percentage (value / 100) applied to the ZEROMV rd adjustment in pickmode.
+  unsigned int pickmode_mv_bias;
+  // Quantizer threshold below which we use the segmentation map to switch off
+  // loop filter for blocks that have been coded as ZEROMV-LAST more than a
+  // given number (consec_zerolast) of consecutive frames. Note that the
+  // delta-QP is set to 0 when segmentation map is used for this purpose.
+  unsigned int qp_thresh;
+  // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
+  unsigned int consec_zerolast;
+} denoise_params;
+
 typedef struct vp8_denoiser
 {
     YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
     YV12_BUFFER_CONFIG yv12_mc_running_avg;
     unsigned char* denoise_state;
     int num_mb_cols;
+    int aggressive_mode;
+    denoise_params denoise_pars;
 } VP8_DENOISER;
 
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
-                          int num_mb_rows, int num_mb_cols);
+                          int num_mb_rows, int num_mb_cols, int mode);
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser);
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e6b0f9b64..aec6b9880 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi,
             }
 
 #endif
+            // Keep track of how many (consecutive) times a block is coded
+            // as ZEROMV with the LAST_FRAME reference, for base layer frames.
+            // Reset to 0 if it's coded as anything else.
+            if (cpi->current_layer == 0) {
+              if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                  xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+                // Saturating increment: stop at 255 to avoid wrap-around.
+                if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                  cpi->consec_zero_last[map_index+mb_col] += 1;
+              } else {
+                cpi->consec_zero_last[map_index+mb_col] = 0;
+              }
+            }
 
             /* Special case code for cyclic refresh
              * If cyclic update enabled then copy xd->mbmi.segment_id; (which
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d4b17cef1..7b8b51f30 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                         }
 
 #endif
+                        // Keep track of how many (consecutive) times a block
+                        // is coded as ZEROMV with the LAST_FRAME reference,
+                        // for base layer frames. Reset to 0 otherwise.
+                        if (cpi->current_layer == 0) {
+                          if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                              xd->mode_info_context->mbmi.ref_frame ==
+                                  LAST_FRAME) {
+                            // Saturating increment: stop at 255, no wrap.
+                            if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                              cpi->consec_zero_last[map_index+mb_col] +=
+                                  1;
+                          } else {
+                            cpi->consec_zero_last[map_index+mb_col] = 0;
+                          }
+                        }
 
                         /* Special case code for cyclic refresh
                          * If cyclic update enabled then copy
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 469d0d6e9..e81c05e6f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
         while(block_count && i != cpi->cyclic_refresh_mode_index);
 
         cpi->cyclic_refresh_mode_index = i;
+
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->denoiser.aggressive_mode != 0 &&
+            Q < cpi->denoiser.denoise_pars.qp_thresh) {
+          // Under aggressive denoising mode, use segmentation to turn off loop
+          // filter below some qp thresh. The loop filter is turned off for all
+          // blocks that have been encoded as ZEROMV LAST x frames in a row,
+          // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+          // This is to avoid "dot" artifacts that can occur from repeated
+          // loop filtering on noisy input source.
+          cpi->cyclic_refresh_q = Q;
+          lf_adjustment = -MAX_LOOP_FILTER;
+          for (i = 0; i < mbs_in_frame; ++i) {
+            seg_map[i] = (cpi->consec_zero_last[i] >
+                          cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+          }
+        }
+#endif
     }
 
     /* Activate segmentation. */
@@ -1752,7 +1770,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
         int width = (cpi->oxcf.Width + 15) & ~15;
         int height = (cpi->oxcf.Height + 15) & ~15;
         vp8_denoiser_allocate(&cpi->denoiser, width, height,
-                              cpi->common.mb_rows, cpi->common.mb_cols);
+                              cm->mb_rows, cm->mb_cols,
+                              ((cpi->oxcf.noise_sensitivity == 3) ? 1 : 0));
       }
     }
 #endif
@@ -1896,6 +1915,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     else
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
+    CHECK_MEM_ERROR(cpi->consec_zero_last,
+                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
 #ifdef VP8_ENTROPY_STATS
     init_context_counters();
 #endif
@@ -2416,6 +2438,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
     vpx_free(cpi->mb.ss);
     vpx_free(cpi->tok);
     vpx_free(cpi->cyclic_refresh_map);
+    vpx_free(cpi->consec_zero_last);
 
     vp8_remove_common(&cpi->common);
     vpx_free(cpi);
@@ -3478,6 +3501,9 @@ static void encode_frame_to_data_rate
         {
             cpi->mb.rd_thresh_mult[i] = 128;
         }
+
+        // Reset the zero_last counter to 0 on key frame.
+        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
     }
 
 #if 0
@@ -3899,6 +3925,7 @@ static void encode_frame_to_data_rate
 
 #endif
 
+
 #ifdef OUTPUT_YUV_SRC
     vp8_write_yuv_frame(yuv_file, cpi->Source);
 #endif
@@ -3994,6 +4021,8 @@ static void encode_frame_to_data_rate
                 else
                   disable_segmentation(cpi);
               }
+              // Reset the consec_zero_last counter on key frame.
+              vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
               vp8_set_quantizer(cpi, Q);
             }
 
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index df17dff34..7a8baca77 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP
     int cyclic_refresh_mode_index;
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
+    // Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
+    unsigned char *consec_zero_last;
 
     // Frame counter for the temporal pattern. Counter is rest when the temporal
     // layers are changed dynamically (run-time change).
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 86108b70a..ec1ea146f 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
 extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
 
-
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                 int_mv *bestmv, int_mv *ref_mv,
                                 int error_per_bit,
@@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
      */
     calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      rd_adjustment = (int)(rd_adjustment *
+          cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+    }
+#endif
+
     /* if we encode a new mv this is important
      * find the best new motion vector
      */
@@ -1168,7 +1174,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-        int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
+        int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
         if (x->best_sse_inter_mode == DC_PRED)
         {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 666afa4dc..d3d874dfc 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -420,7 +420,7 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc
 specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -435,7 +435,7 @@ add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
 specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
 
 add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
+specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -483,7 +483,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_
 specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -757,10 +757,10 @@ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stri
 specialize qw/vp9_fdct4x4 sse2/;
 
 add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2/;
+specialize qw/vp9_fdct8x8_1 sse2 neon/;
 
 add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2/, "$ssse3_x86_64";
+specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct16x16_1 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 000000000..6c66f5d5b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+  int r;
+  int16x8_t sum = vld1q_s16(&input[0]);  // Start with the first 8 values.
+  for (r = 1; r < 8; ++r) {  // Accumulate the 7 remaining rows lane-wise.
+    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+    sum = vaddq_s16(sum, input_00);
+  }
+  {
+    const int32x4_t a = vpaddlq_s16(sum);  // Pairwise widening add.
+    const int64x2_t b = vpaddlq_s32(a);  // Pairwise widening add.
+    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                                 vreinterpret_s32_s64(vget_high_s64(b)));
+    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);  // 16-bit lane 0.
+    output[1] = 0;
+  }
+}
+
+void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+  int i;
+  // stage 1
+  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  for (i = 0; i < 2; ++i) {
+    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+    // fdct4(step, step);
+    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+    // fdct4(step, step);
+    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+    }
+    // Stage 2
+    v_x0 = vsubq_s16(v_s6, v_s5);
+    v_x1 = vaddq_s16(v_s6, v_s5);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x8_t ab = vcombine_s16(a, b);
+      const int16x8_t cd = vcombine_s16(c, d);
+      // Stage 3
+      v_x0 = vaddq_s16(v_s4, ab);
+      v_x1 = vsubq_s16(v_s4, ab);
+      v_x2 = vsubq_s16(v_s7, cd);
+      v_x3 = vaddq_s16(v_s7, cd);
+    }
+    // Stage 4
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+    }
+    // transpose 8x8
+    {
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      // 04 05 06 07 44 45 46 47
+      // 14 15 16 17 54 55 56 57
+      // 24 25 26 27 64 65 66 67
+      // 34 35 36 37 74 75 76 77
+      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+                                            vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+                                            vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+                                            vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+                                            vreinterpretq_s32_s16(out_7));
+      const int16x8x2_t r01_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                    vreinterpretq_s16_s32(r13_s32.val[0]));
+      const int16x8x2_t r23_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                    vreinterpretq_s16_s32(r13_s32.val[1]));
+      const int16x8x2_t r45_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                    vreinterpretq_s16_s32(r57_s32.val[0]));
+      const int16x8x2_t r67_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                    vreinterpretq_s16_s32(r57_s32.val[1]));
+      input_0 = r01_s16.val[0];
+      input_1 = r01_s16.val[1];
+      input_2 = r23_s16.val[0];
+      input_3 = r23_s16.val[1];
+      input_4 = r45_s16.val[0];
+      input_5 = r45_s16.val[1];
+      input_6 = r67_s16.val[0];
+      input_7 = r67_s16.val[1];
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }  // for
+  {
+    // from vp9_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+    input_0 = vhsubq_s16(input_0, sign_in0);
+    input_1 = vhsubq_s16(input_1, sign_in1);
+    input_2 = vhsubq_s16(input_2, sign_in2);
+    input_3 = vhsubq_s16(input_3, sign_in3);
+    input_4 = vhsubq_s16(input_4, sign_in4);
+    input_5 = vhsubq_s16(input_5, sign_in5);
+    input_6 = vhsubq_s16(input_6, sign_in6);
+    input_7 = vhsubq_s16(input_7, sign_in7);
+    // store results
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
+
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
new file mode 100644
index 000000000..f6871188b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -0,0 +1,129 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+enum { kWidth16 = 16 };
+enum { kHeight16 = 16 };
+enum { kHeight16PlusOne = 17 };
+enum { kPixelStepOne = 1 };
+
+// Horizontal add: returns the sum of the eight 16-bit lanes of |v_16x8|.
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  // Two widening pairwise adds: 8 x s16 -> 4 x s32 -> 2 x s64.
+  const int64x2_t wide = vpaddlq_s32(vpaddlq_s16(v_16x8));
+  return vget_lane_s32(vadd_s32(vreinterpret_s32_s64(vget_low_s64(wide)),
+                                vreinterpret_s32_s64(vget_high_s64(wide))), 0);
+}
+
+// Horizontal add: returns the sum of the four 32-bit lanes of |v_32x4|.
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t wide = vpaddlq_s32(v_32x4);
+  return vget_lane_s32(vadd_s32(vreinterpret_s32_s64(vget_low_s64(wide)),
+                                vreinterpret_s32_s64(vget_high_s64(wide))), 0);
+}
+
+// Walks a w x h block eight pixels per step and stores the sum of the pixel
+// differences a - b in |*sum| and the sum of their squares in |*sse|.
+// Each 16-bit lane of the running sum receives (w / 8) * h differences of
+// magnitude <= 255, which stays in range for the 16x16 blocks used here.
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, unsigned int *sse, int *sum) {
+  int16x8_t sum_acc = vdupq_n_s16(0);
+  int32x4_t sse_acc_lo = vdupq_n_s32(0);
+  int32x4_t sse_acc_hi = vdupq_n_s32(0);
+  int row, col;
+
+  for (row = 0; row < h; ++row, a += a_stride, b += b_stride) {
+    for (col = 0; col < w; col += 8) {
+      // The unsigned widening subtract wraps modulo 2^16, so viewing the
+      // result as signed yields the true difference in [-255, 255].
+      const int16x8_t diff =
+          vreinterpretq_s16_u16(vsubl_u8(vld1_u8(&a[col]), vld1_u8(&b[col])));
+      const int16x4_t diff_lo = vget_low_s16(diff);
+      const int16x4_t diff_hi = vget_high_s16(diff);
+      sum_acc = vaddq_s16(sum_acc, diff);
+      sse_acc_lo = vmlal_s16(sse_acc_lo, diff_lo, diff_lo);
+      sse_acc_hi = vmlal_s16(sse_acc_hi, diff_hi, diff_hi);
+    }
+  }
+
+  *sum = horizontal_add_s16x8(sum_acc);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_acc_lo, sse_acc_hi));
+}
+
+void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
+                          const uint8_t *ref_ptr, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
+                   kHeight16, sse, sum);  // fills |*sse|, |*sum| for 16x16
+}
+
+unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int total;  // sum of the 256 differences; squared in 64 bits below
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &total);
+  return *sse - (((int64_t)total * total) / (kWidth16 * kHeight16));
+}
+
+// Filters |output_height| rows of 16 pixels with the two taps in |vp9_filter|:
+//   out[x] = (src[x] * f[0] + src[x + pixel_step] * f[1]) >> FILTER_BITS,
+// with rounding. |output_width| is only used as the output row stride.
+static void var_filter_block2d_bil_w16(
+    const uint8_t *src_ptr, uint8_t *output_ptr,
+    unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const int16_t *vp9_filter) {
+  const uint8x8_t tap0 = vmov_n_u8((uint8_t)vp9_filter[0]);
+  const uint8x8_t tap1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+  unsigned int row;
+  for (row = 0; row < output_height; ++row) {
+    const uint8x16_t cur = vld1q_u8(src_ptr);
+    const uint8x16_t nxt = vld1q_u8(src_ptr + pixel_step);
+    // Multiply-accumulate in 16 bits, then round and narrow back to 8 bits.
+    const uint16x8_t acc_lo = vmlal_u8(vmull_u8(vget_low_u8(cur), tap0),
+                                       vget_low_u8(nxt), tap1);
+    const uint16x8_t acc_hi = vmlal_u8(vmull_u8(vget_high_u8(cur), tap0),
+                                       vget_high_u8(nxt), tap1);
+    vst1q_u8(output_ptr, vcombine_u8(vrshrn_n_u16(acc_lo, FILTER_BITS),
+                                     vrshrn_n_u16(acc_hi, FILTER_BITS)));
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+// Sub-pixel variance of a 16x16 block. |src| goes through two passes of
+// var_filter_block2d_bil_w16(): along each row with the |xoffset| taps (17
+// rows, one extra for the next pass), then between adjacent rows with the
+// |yoffset| taps. The filtered block is then compared against |dst|.
+unsigned int vp9_sub_pixel_variance16x16_neon(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *dst, int dst_stride, unsigned int *sse) {
+  DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, vert, kHeight16 * kWidth16);
+  DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, horiz, kHeight16PlusOne * kWidth16);
+
+  var_filter_block2d_bil_w16(src, horiz, src_stride, kPixelStepOne,
+                             kHeight16PlusOne, kWidth16,
+                             BILINEAR_FILTERS_2TAP(xoffset));
+  var_filter_block2d_bil_w16(horiz, vert, kWidth16, kWidth16, kHeight16,
+                             kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
+  return vp9_variance16x16_neon(vert, kWidth16, dst, dst_stride, sse);
+}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index fe15edfdc..1b574758b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -553,6 +553,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
       const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
       const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+#if CONFIG_FP_MB_STATS
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
 
       vp9_clear_system_state();
 
@@ -600,7 +603,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 #if CONFIG_FP_MB_STATS
       if (cpi->use_fp_mb_stats) {
         // initialization
-        cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
       }
 #endif
 
@@ -704,26 +707,20 @@ void vp9_first_pass(VP9_COMP *cpi) {
 #if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
           // intra predication statistics
-          cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-              FPMB_DCINTRA_MASK;
-          cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
-              (~FPMB_NONZERO_MOTION_MASK);
+          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] &=
+              ~FPMB_NONZERO_MOTION_MASK;
           if (this_error > FPMB_ERROR_LEVEL4_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL4_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK;
           } else if (this_error > FPMB_ERROR_LEVEL3_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL3_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK;
           } else if (this_error > FPMB_ERROR_LEVEL2_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL2_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK;
           } else if (this_error > FPMB_ERROR_LEVEL1_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL1_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK;
           } else {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL0_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK;
           }
         }
 #endif
@@ -759,25 +756,24 @@ void vp9_first_pass(VP9_COMP *cpi) {
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
             // inter predication statistics
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
-                (~FPMB_DCINTRA_MASK);
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
-                (~FPMB_NONZERO_MOTION_MASK);
+            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &=
+                ~FPMB_NONZERO_MOTION_MASK;
             if (this_error > FPMB_ERROR_LEVEL4_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL4_MASK;
             } else if (this_error > FPMB_ERROR_LEVEL3_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL3_MASK;
             } else if (this_error > FPMB_ERROR_LEVEL2_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL2_MASK;
             } else if (this_error > FPMB_ERROR_LEVEL1_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL1_MASK;
             } else {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL0_MASK;
             }
           }
@@ -788,7 +784,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
 #if CONFIG_FP_MB_STATS
             if (cpi->use_fp_mb_stats) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_NONZERO_MOTION_MASK;
             }
 #endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 96473f5a1..7a1600155 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -410,6 +410,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   PRED_BUFFER *this_mode_pred = NULL;
   int i;
 
+  // ctx is used by the temporal denoiser, which is currently being developed.
+  // TODO(jbb): when the temporal denoiser is finished and in the default
+  // build, remove the following line.
+  (void) ctx;
   if (cpi->sf.reuse_inter_pred_sby) {
     for (i = 0; i < 3; i++) {
       tmp[i].data = &pred_buf[pixels_in_block * i];
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3381cb95a..77b968bda 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -130,5 +130,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index 182892acf..bd9eebd64 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -73,6 +73,33 @@ static void once(void (*func)(void))
 }
 
 
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void once(void (*func)(void))
+{
+    static int done;  /* set to 1 after func() has returned */
+
+    /* func() must run only once: return early if that already happened. */
+    if(done)
+        return;
+
+    /* Causes all other threads in the process to block themselves
+     * and give up their time slice while the flag is re-checked below.
+     */
+    DosEnterCritSec();
+
+    if (!done)  /* re-check: another thread may have finished first */
+    {
+        func();
+        done = 1;
+    }
+
+    /* Restores normal thread dispatching for the current process. */
+    DosExitCritSec();
+}
+
+
 #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
 #include <pthread.h>
 static void once(void (*func)(void))