15 files changed, 413 insertions, 47 deletions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index a7ad9f0c6..be3e7b2f1 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -579,7 +579,7 @@ int main(int argc, char **argv) {
 
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
-     vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+    vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
       vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
       vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 6a5d6bb98..567e5f698 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0)));
+        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
 INSTANTIATE_TEST_CASE_P(
     DISABLED_NEON, FwdTrans8x8HT,
     ::testing::Values(
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 7d9441d54..ef7f61b12 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -108,8 +108,8 @@ extern "C"
          * For temporal denoiser: noise_sensitivity = 0 means off,
          * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
          * noise_sensitivity = 2 means temporal denoiser on for all channels.
-         * noise_sensitivity = 3 will be used for aggressive mode in future.
-         * Temporal denoiser is enabled via the build option
+         * noise_sensitivity = 3 means aggressive denoising mode.
+         * Temporal denoiser is enabled via the configuration option:
          * CONFIG_TEMPORAL_DENOISING.
          * For spatial denoiser: noise_sensitivity controls the amount of
          * pre-processing blur: noise_sensitivity = 0 means off.
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 1a401a4b9..c4c0de81b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "denoising.h"
 
 #include "vp8/common/reconinter.h"
@@ -333,12 +335,33 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     return FILTER_BLOCK;
 }
 
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser) {
+  if (!denoiser->aggressive_mode) {
+    denoiser->denoise_pars.scale_sse_thresh = 1;
+    denoiser->denoise_pars.scale_motion_thresh = 8;
+    denoiser->denoise_pars.scale_increase_filter = 0;
+    denoiser->denoise_pars.denoise_mv_bias = 95;
+    denoiser->denoise_pars.pickmode_mv_bias = 100;
+    denoiser->denoise_pars.qp_thresh = 0;
+    denoiser->denoise_pars.consec_zerolast = UINT_MAX;
+  } else {
+    denoiser->denoise_pars.scale_sse_thresh = 2;
+    denoiser->denoise_pars.scale_motion_thresh = 16;
+    denoiser->denoise_pars.scale_increase_filter = 1;
+    denoiser->denoise_pars.denoise_mv_bias = 60;
+    denoiser->denoise_pars.pickmode_mv_bias = 60;
+    denoiser->denoise_pars.qp_thresh = 100;
+    denoiser->denoise_pars.consec_zerolast = 10;
+  }
+}
+
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
-                          int num_mb_rows, int num_mb_cols)
+                          int num_mb_rows, int num_mb_cols, int mode)
 {
     int i;
     assert(denoiser);
     denoiser->num_mb_cols = num_mb_cols;
+    denoiser->aggressive_mode = mode;
 
     for (i = 0; i < MAX_REF_FRAMES; i++)
     {
@@ -369,10 +392,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
 
     denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
     vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
-
+    vp8_denoiser_set_parameters(denoiser);
     return 0;
 }
 
+
 void vp8_denoiser_free(VP8_DENOISER *denoiser)
 {
     int i;
@@ -401,6 +425,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
 {
     int mv_row;
     int mv_col;
+    unsigned int motion_threshold;
     unsigned int motion_magnitude2;
     unsigned int sse_thresh;
     int sse_diff_thresh = 0;
@@ -424,7 +449,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
         int sse_diff = 0;
         // Bias on zero motion vector sse.
-        int zero_bias = 95;
+        const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
         zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
         sse_diff = zero_mv_sse - best_sse;
 
@@ -502,14 +527,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
     mv_row = x->best_sse_mv.as_mv.row;
     mv_col = x->best_sse_mv.as_mv.col;
     motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
-    sse_thresh = SSE_THRESHOLD;
-    if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH;
+    motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+        NOISE_MOTION_THRESHOLD;
 
-    if (best_sse > sse_thresh || motion_magnitude2
-           > 8 * NOISE_MOTION_THRESHOLD)
-    {
-        decision = COPY_BLOCK;
-    }
+    if (motion_magnitude2 <
+        denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+      x->increase_denoising = 1;
+
+    sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+    if (x->increase_denoising)
+      sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+    if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+      decision = COPY_BLOCK;
 
     if (decision == FILTER_BLOCK)
     {
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index a1f195b72..1a42f86d3 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -39,16 +39,40 @@ enum vp8_denoiser_filter_state {
   kFilterNonZeroMV
 };
 
+typedef struct {
+  // Scale factor on sse threshold above which no denoising is done.
+  unsigned int scale_sse_thresh;
+  // Scale factor on motion magnitude threshold above which no
+  // denoising is done.
+  unsigned int scale_motion_thresh;
+  // Scale factor on motion magnitude below which we increase the strength of
+  // the temporal filter (in function vp8_denoiser_filter).
+  unsigned int scale_increase_filter;
+  // Scale factor to bias to ZEROMV for denoising.
+  unsigned int denoise_mv_bias;
+  // Scale factor to bias to ZEROMV for coding mode selection.
+  unsigned int pickmode_mv_bias;
+  // Quantizer threshold below which we use the segmentation map to switch off
+  // loop filter for blocks that have been coded as ZEROMV-LAST a certain number
+  // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to
+  // 0 when segmentation map is used for shutting off loop filter.
+  unsigned int qp_thresh;
+  // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
+  unsigned int consec_zerolast;
+} denoise_params;
+
 typedef struct vp8_denoiser
 {
     YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
     YV12_BUFFER_CONFIG yv12_mc_running_avg;
     unsigned char* denoise_state;
     int num_mb_cols;
+    int aggressive_mode;
+    denoise_params denoise_pars;
 } VP8_DENOISER;
 
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
-                          int num_mb_rows, int num_mb_cols);
+                          int num_mb_rows, int num_mb_cols, int mode);
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser);
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e6b0f9b64..aec6b9880 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi,
             }
 
 #endif
+            // Keep track of how many (consecutive) times a block is coded
+            // as ZEROMV_LASTREF, for base layer frames.
+            // Reset to 0 if its coded as anything else.
+            if (cpi->current_layer == 0) {
+              if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                  xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+                // Increment, check for wrap-around.
+                if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                  cpi->consec_zero_last[map_index+mb_col] += 1;
+              } else {
+                cpi->consec_zero_last[map_index+mb_col] = 0;
+              }
+            }
 
             /* Special case code for cyclic refresh
              * If cyclic update enabled then copy xd->mbmi.segment_id; (which
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d4b17cef1..7b8b51f30 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                         }
 
 #endif
+                        // Keep track of how many (consecutive) times a  block
+                        // is coded as ZEROMV_LASTREF, for base layer frames.
+                        // Reset to 0 if its coded as anything else.
+                        if (cpi->current_layer == 0) {
+                          if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                              xd->mode_info_context->mbmi.ref_frame ==
+                                  LAST_FRAME) {
+                            // Increment, check for wrap-around.
+                            if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                              cpi->consec_zero_last[map_index+mb_col] +=
+                                  1;
+                          } else {
+                            cpi->consec_zero_last[map_index+mb_col] = 0;
+                          }
+                        }
 
                         /* Special case code for cyclic refresh
                          * If cyclic update enabled then copy
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 469d0d6e9..e81c05e6f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
         while(block_count && i != cpi->cyclic_refresh_mode_index);
 
         cpi->cyclic_refresh_mode_index = i;
+
+#if CONFIG_TEMPORAL_DENOISING
+        if (cpi->denoiser.aggressive_mode != 0 &&
+            Q < cpi->denoiser.denoise_pars.qp_thresh) {
+          // Under aggressive denoising mode, use segmentation to turn off loop
+          // filter below some qp thresh. The loop filter is turned off for all
+          // blocks that have been encoded as ZEROMV LAST x frames in a row,
+          // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+          // This is to avoid "dot" artifacts that can occur from repeated
+          // loop filtering on noisy input source.
+          cpi->cyclic_refresh_q = Q;
+          lf_adjustment = -MAX_LOOP_FILTER;
+          for (i = 0; i < mbs_in_frame; ++i) {
+            seg_map[i] = (cpi->consec_zero_last[i] >
+                          cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+          }
+        }
+#endif
     }
 
     /* Activate segmentation. */
@@ -1752,7 +1770,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
         int width = (cpi->oxcf.Width + 15) & ~15;
         int height = (cpi->oxcf.Height + 15) & ~15;
         vp8_denoiser_allocate(&cpi->denoiser, width, height,
-                              cpi->common.mb_rows, cpi->common.mb_cols);
+                              cm->mb_rows, cm->mb_cols,
+                              ((cpi->oxcf.noise_sensitivity == 3) ? 1 : 0));
       }
     }
 #endif
@@ -1896,6 +1915,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     else
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
+    CHECK_MEM_ERROR(cpi->consec_zero_last,
+                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
 #ifdef VP8_ENTROPY_STATS
     init_context_counters();
 #endif
@@ -2416,6 +2438,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
     vpx_free(cpi->mb.ss);
     vpx_free(cpi->tok);
     vpx_free(cpi->cyclic_refresh_map);
+    vpx_free(cpi->consec_zero_last);
 
     vp8_remove_common(&cpi->common);
     vpx_free(cpi);
@@ -3478,6 +3501,9 @@ static void encode_frame_to_data_rate
         {
             cpi->mb.rd_thresh_mult[i] = 128;
         }
+
+        // Reset the zero_last counter to 0 on key frame.
+        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
     }
 
 #if 0
@@ -3899,6 +3925,7 @@ static void encode_frame_to_data_rate
 
 #endif
 
+
 #ifdef OUTPUT_YUV_SRC
     vp8_write_yuv_frame(yuv_file, cpi->Source);
 #endif
@@ -3994,6 +4021,8 @@ static void encode_frame_to_data_rate
                 else
                   disable_segmentation(cpi);
               }
+              // Reset the consec_zero_last counter on key frame.
+              vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
               vp8_set_quantizer(cpi, Q);
             }
 
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index df17dff34..7a8baca77 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP
     int cyclic_refresh_mode_index;
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
+    // Count on how many (consecutive) times a macroblock uses ZER0MV_LAST.
+    unsigned char *consec_zero_last;
 
     // Frame counter for the temporal pattern. Counter is rest when the temporal
     // layers are changed dynamically (run-time change).
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 86108b70a..ec1ea146f 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
 extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
 
-
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                 int_mv *bestmv, int_mv *ref_mv,
                                 int error_per_bit,
@@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
      */
     calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      rd_adjustment = (int)(rd_adjustment *
+          cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+    }
+#endif
+
     /* if we encode a new mv this is important
      * find the best new motion vector
      */
@@ -1168,7 +1174,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-        int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
+        int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
         if (x->best_sse_inter_mode == DC_PRED)
         {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 666afa4dc..d58774924 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -757,10 +757,10 @@ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stri
 specialize qw/vp9_fdct4x4 sse2/;
 
 add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2/;
+specialize qw/vp9_fdct8x8_1 sse2 neon/;
 
 add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2/, "$ssse3_x86_64";
+specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
 
 add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct16x16_1 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 000000000..6c66f5d5b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+  int r;
+  int16x8_t sum = vld1q_s16(&input[0]);
+  for (r = 1; r < 8; ++r) {
+    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+    sum = vaddq_s16(sum, input_00);
+  }
+  {
+    const int32x4_t a = vpaddlq_s16(sum);
+    const int64x2_t b = vpaddlq_s32(a);
+    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                                 vreinterpret_s32_s64(vget_high_s64(b)));
+    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+    output[1] = 0;
+  }
+}
+
+void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+  int i;
+  // stage 1
+  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  for (i = 0; i < 2; ++i) {
+    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+    // fdct4(step, step);
+    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+    // fdct4(step, step);
+    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+    }
+    // Stage 2
+    v_x0 = vsubq_s16(v_s6, v_s5);
+    v_x1 = vaddq_s16(v_s6, v_s5);
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x8_t ab = vcombine_s16(a, b);
+      const int16x8_t cd = vcombine_s16(c, d);
+      // Stage 3
+      v_x0 = vaddq_s16(v_s4, ab);
+      v_x1 = vsubq_s16(v_s4, ab);
+      v_x2 = vsubq_s16(v_s7, cd);
+      v_x3 = vaddq_s16(v_s7, cd);
+    }
+    // Stage 4
+    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+    {
+      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+    }
+    // transpose 8x8
+    {
+      // 00 01 02 03 40 41 42 43
+      // 10 11 12 13 50 51 52 53
+      // 20 21 22 23 60 61 62 63
+      // 30 31 32 33 70 71 72 73
+      // 04 05 06 07 44 45 46 47
+      // 14 15 16 17 54 55 56 57
+      // 24 25 26 27 64 65 66 67
+      // 34 35 36 37 74 75 76 77
+      const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+                                            vreinterpretq_s32_s16(out_2));
+      const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+                                            vreinterpretq_s32_s16(out_3));
+      const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+                                            vreinterpretq_s32_s16(out_6));
+      const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+                                            vreinterpretq_s32_s16(out_7));
+      const int16x8x2_t r01_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                    vreinterpretq_s16_s32(r13_s32.val[0]));
+      const int16x8x2_t r23_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                    vreinterpretq_s16_s32(r13_s32.val[1]));
+      const int16x8x2_t r45_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                    vreinterpretq_s16_s32(r57_s32.val[0]));
+      const int16x8x2_t r67_s16 =
+          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                    vreinterpretq_s16_s32(r57_s32.val[1]));
+      input_0 = r01_s16.val[0];
+      input_1 = r01_s16.val[1];
+      input_2 = r23_s16.val[0];
+      input_3 = r23_s16.val[1];
+      input_4 = r45_s16.val[0];
+      input_5 = r45_s16.val[1];
+      input_6 = r67_s16.val[0];
+      input_7 = r67_s16.val[1];
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }  // for
+  {
+    // from vp9_dct_sse2.c
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+    input_0 = vhsubq_s16(input_0, sign_in0);
+    input_1 = vhsubq_s16(input_1, sign_in1);
+    input_2 = vhsubq_s16(input_2, sign_in2);
+    input_3 = vhsubq_s16(input_3, sign_in3);
+    input_4 = vhsubq_s16(input_4, sign_in4);
+    input_5 = vhsubq_s16(input_5, sign_in5);
+    input_6 = vhsubq_s16(input_6, sign_in6);
+    input_7 = vhsubq_s16(input_7, sign_in7);
+    // store results
+    vst1q_s16(&final_output[0 * 8], input_0);
+    vst1q_s16(&final_output[1 * 8], input_1);
+    vst1q_s16(&final_output[2 * 8], input_2);
+    vst1q_s16(&final_output[3 * 8], input_3);
+    vst1q_s16(&final_output[4 * 8], input_4);
+    vst1q_s16(&final_output[5 * 8], input_5);
+    vst1q_s16(&final_output[6 * 8], input_6);
+    vst1q_s16(&final_output[7 * 8], input_7);
+  }
+}
+
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index fe15edfdc..1b574758b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -553,6 +553,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
       const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
       double error_weight = 1.0;
       const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+#if CONFIG_FP_MB_STATS
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
 
       vp9_clear_system_state();
 
@@ -600,7 +603,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 #if CONFIG_FP_MB_STATS
       if (cpi->use_fp_mb_stats) {
         // initialization
-        cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
       }
 #endif
 
@@ -704,26 +707,20 @@ void vp9_first_pass(VP9_COMP *cpi) {
 #if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
           // intra predication statistics
-          cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-              FPMB_DCINTRA_MASK;
-          cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
-              (~FPMB_NONZERO_MOTION_MASK);
+          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] &=
+              ~FPMB_NONZERO_MOTION_MASK;
           if (this_error > FPMB_ERROR_LEVEL4_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL4_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK;
           } else if (this_error > FPMB_ERROR_LEVEL3_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL3_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK;
           } else if (this_error > FPMB_ERROR_LEVEL2_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL2_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK;
           } else if (this_error > FPMB_ERROR_LEVEL1_TH) {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL1_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK;
           } else {
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
-                FPMB_ERROR_LEVEL0_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK;
           }
         }
 #endif
@@ -759,25 +756,24 @@ void vp9_first_pass(VP9_COMP *cpi) {
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
             // inter predication statistics
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
-                (~FPMB_DCINTRA_MASK);
-            cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
-                (~FPMB_NONZERO_MOTION_MASK);
+            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &=
+                ~FPMB_NONZERO_MOTION_MASK;
             if (this_error > FPMB_ERROR_LEVEL4_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL4_MASK;
             } else if (this_error > FPMB_ERROR_LEVEL3_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL3_MASK;
             } else if (this_error > FPMB_ERROR_LEVEL2_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL2_MASK;
             } else if (this_error > FPMB_ERROR_LEVEL1_TH) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL1_MASK;
             } else {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_ERROR_LEVEL0_MASK;
             }
           }
@@ -788,7 +784,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
 #if CONFIG_FP_MB_STATS
             if (cpi->use_fp_mb_stats) {
-              cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
                   FPMB_NONZERO_MOTION_MASK;
             }
 #endif
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3381cb95a..0ea28d313 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -130,5 +130,6 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index 182892acf..bd9eebd64 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -73,6 +73,33 @@ static void once(void (*func)(void))
 }
 
 
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void once(void (*func)(void))
+{
+    static int done;
+
+    /* If the initialization is complete, return early. */
+    if(done)
+        return;
+
+    /* Causes all other threads in the process to block themselves
+     * and give up their time slice.
+     */
+    DosEnterCritSec();
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    /* Restores normal thread dispatching for the current process. */
+    DosExitCritSec();
+}
+
+
 #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
 #include <pthread.h>
 static void once(void (*func)(void))