17 files changed, 893 insertions, 405 deletions
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index bc68b3756..5f1c8ce74 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -319,6 +319,20 @@ void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) {
   cr->sb_index = i;
 }
 
+// Set/update global/frame level cyclic refresh parameters.
+void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  cr->percent_refresh = 10;
+  // Use larger delta-qp (increase rate_ratio_qdelta) for the first 4 periods
+  // of the refresh cycle after a key frame. A period is 100 / percent_refresh
+  // frames, so this covers ~40 frames with cr->percent_refresh = 10.
+  if (rc->frames_since_key < 4 * (100 / cr->percent_refresh))
+    cr->rate_ratio_qdelta = 3.0;
+  else
+    cr->rate_ratio_qdelta = 2.0;
+}
+
 // Setup cyclic background refresh: set delta q and segmentation map.
 void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -343,9 +357,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
     int qindex2;
     const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
     vp9_clear_system_state();
-    // Some of these parameters may be set via codec-control function later.
-    cr->percent_refresh = 10;
-    cr->rate_ratio_qdelta = 2.0;
     cr->max_qdelta_perc = 50;
     cr->min_block_size = BLOCK_8X8;
     cr->time_for_refresh = 0;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 3fc677646..656d7605b 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -53,6 +53,9 @@ void vp9_cyclic_refresh_update_map(struct VP9_COMP *const cpi);
 // Update the actual number of blocks that were applied the segment delta q.
 void vp9_cyclic_refresh_update_actual_count(struct VP9_COMP *const cpi);
 
+// Set/update global/frame level refresh parameters.
+void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
+
 // Setup cyclic background refresh: set delta q and segmentation map.
 void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
 
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index 144936d54..be6f7e4ee 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -19,15 +19,15 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#define ENERGY_MIN (-1)
+#define ENERGY_MIN (-4)
 #define ENERGY_MAX (1)
 #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN +  1)
 #define ENERGY_IN_BOUNDS(energy)\
   assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
 
 static const double rate_ratio[MAX_SEGMENTS] =
-  {1.143, 1.0, 0.875, 1.0, 1.0, 1.0, 1.0, 1.0};
-static const int segment_id[ENERGY_SPAN] = {0, 1, 2};
+  {2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4};
 
 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b9cfc42fb..8d2f14f2c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -36,6 +36,7 @@
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rd.h"
@@ -3497,7 +3498,7 @@ static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
          cm->show_frame;
 }
 
-static void init_tile_data(VP9_COMP *cpi) {
+void vp9_init_tile_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
@@ -3535,36 +3536,40 @@ static void init_tile_data(VP9_COMP *cpi) {
   }
 }
 
+// Encodes tile (tile_row, tile_col) one sb row at a time using thread data
+// td, then stores the number of tokens written in cpi->tok_count.
+void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td,
+                     int tile_row, int tile_col) {
+  const int num_tile_cols = 1 << cpi->common.log2_tile_cols;
+  TileDataEnc *tile = &cpi->tile_data[tile_row * num_tile_cols + tile_col];
+  const TileInfo *const info = &tile->tile_info;
+  TOKENEXTRA *tok_end = cpi->tile_tok[tile_row][tile_col];
+  int row = info->mi_row_start;
+
+  while (row < info->mi_row_end) {
+    if (!cpi->sf.use_nonrd_pick_mode)
+      encode_rd_sb_row(cpi, td, tile, row, &tok_end);
+    else
+      encode_nonrd_sb_row(cpi, td, tile, row, &tok_end);
+    row += MI_BLOCK_SIZE;
+  }
+  cpi->tok_count[tile_row][tile_col] =
+      (unsigned int)(tok_end - cpi->tile_tok[tile_row][tile_col]);
+  assert(tok_end - cpi->tile_tok[tile_row][tile_col] <=
+         allocated_tokens(*info));
+}
+
 static void encode_tiles(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
   const int tile_rows = 1 << cm->log2_tile_rows;
   int tile_col, tile_row;
 
-  init_tile_data(cpi);
+  vp9_init_tile_data(cpi);
 
-  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
-    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      const TileInfo * const tile_info =
-          &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
-      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
-      int mi_row;
-      TileDataEnc *this_tile =
-          &cpi->tile_data[tile_row * tile_cols + tile_col];
-
-      for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-           mi_row += MI_BLOCK_SIZE) {
-        if (cpi->sf.use_nonrd_pick_mode)
-          encode_nonrd_sb_row(cpi, &cpi->td, this_tile, mi_row, &tok);
-        else
-          encode_rd_sb_row(cpi, &cpi->td, this_tile, mi_row, &tok);
-      }
-      cpi->tok_count[tile_row][tile_col] =
-          (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
-      assert(tok - cpi->tile_tok[tile_row][tile_col] <=
-          allocated_tokens(*tile_info));
-    }
-  }
+  for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+    for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+      vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
 }
 
 #if CONFIG_FP_MB_STATS
@@ -3677,7 +3682,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   }
 #endif
 
-    encode_tiles(cpi);
+    // If allowed, encoding tiles in parallel with one thread handling one tile.
+    if (MIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+      vp9_encode_tiles_mt(cpi);
+    else
+      encode_tiles(cpi);
 
     vpx_usec_timer_mark(&emr_timer);
     cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index fd1c9aa64..556f3a5a1 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -19,6 +19,7 @@ extern "C" {
 struct macroblock;
 struct yv12_buffer_config;
 struct VP9_COMP;
+struct ThreadData;
 
 // Constants used in SOURCE_VAR_BASED_PARTITION
 #define VAR_HIST_MAX_BG_VAR 1000
@@ -33,6 +34,10 @@ void vp9_setup_src_planes(struct macroblock *x,
 
 void vp9_encode_frame(struct VP9_COMP *cpi);
 
+void vp9_init_tile_data(struct VP9_COMP *cpi);
+void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
+                     int tile_row, int tile_col);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index ef5bb5ace..9b2165be6 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -652,10 +652,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
     return;
   }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (!x->skip_recode)
-    vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
-#else
   if (!x->skip_recode) {
     if (x->quant_fp) {
       // Encoding process for rtc mode
@@ -687,7 +683,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
       }
     }
   }
-#endif
 
   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
     const int ctx = combine_entropy_contexts(*a, *l);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index e82d5d8df..aee362ae4 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -35,6 +35,7 @@
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -1728,6 +1729,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
 void vp9_remove_compressor(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   unsigned int i;
+  int t;
 
   if (!cpi)
     return;
@@ -1800,6 +1802,24 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
   }
 #endif
 
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VP9Worker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Deallocate allocated threads.
+    vp9_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp9_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+
+    vpx_free(worker->data1);
+  }
+  vpx_free(cpi->workers);
+
   dealloc_compressor_data(cpi);
 
   for (i = 0; i < sizeof(cpi->mbgraph_stats) /
@@ -2476,6 +2496,7 @@ void vp9_scale_references(VP9_COMP *cpi) {
         if (cm->frame_bufs[new_fb].mvs == NULL ||
             cm->frame_bufs[new_fb].mi_rows < cm->mi_rows ||
             cm->frame_bufs[new_fb].mi_cols < cm->mi_cols) {
+          vpx_free(cm->frame_bufs[new_fb].mvs);
           cm->frame_bufs[new_fb].mvs =
             (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
                                  sizeof(*cm->frame_bufs[new_fb].mvs));
@@ -3217,11 +3238,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   vp9_clear_system_state();
 
 #if CONFIG_INTERNAL_STATS
-  {
-    int i;
-    for (i = 0; i < MAX_MODES; ++i)
-      cpi->mode_chosen_counts[i] = 0;
-  }
+  vpx_memset(cpi->mode_chosen_counts, 0,
+             MAX_MODES * sizeof(*cpi->mode_chosen_counts));
 #endif
 
   if (cpi->sf.recode_loop == DISALLOW_RECODE) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index b75f491df..7342f7496 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_thread.h"
 
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_context_tree.h"
@@ -216,6 +217,8 @@ typedef struct VP9EncoderConfig {
   int tile_columns;
   int tile_rows;
 
+  int max_threads;
+
   vpx_fixed_buf_t two_pass_stats_in;
   struct vpx_codec_pkt_list *output_pkt_list;
 
@@ -442,6 +445,10 @@ typedef struct VP9_COMP {
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
+
+  // Multi-threading
+  int num_workers;
+  VP9Worker *workers;
 } VP9_COMP;
 
 void vp9_initialize_enc();
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
new file mode 100644
index 000000000..daf3da44c
--- /dev/null
+++ b/vp9/encoder/vp9_ethread.c
@@ -0,0 +1,272 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+
+static void accumulate_frame_counts(VP9_COMMON *cm, ThreadData *td) {
+  int i, j, k, l, m;  // Every loop below adds td->counts into cm->counts.
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.y_mode[i][j] += td->counts->y_mode[i][j];
+
+  for (i = 0; i < INTRA_MODES; i++)
+    for (j = 0; j < INTRA_MODES; j++)
+      cm->counts.uv_mode[i][j] += td->counts->uv_mode[i][j];
+
+  for (i = 0; i < PARTITION_CONTEXTS; i++)
+    for (j = 0; j < PARTITION_TYPES; j++)
+      cm->counts.partition[i][j] += td->counts->partition[i][j];
+
+  for (i = 0; i < TX_SIZES; i++)
+    for (j = 0; j < PLANE_TYPES; j++)
+      for (k = 0; k < REF_TYPES; k++)
+        for (l = 0; l < COEF_BANDS; l++)
+          for (m = 0; m < COEFF_CONTEXTS; m++)
+            cm->counts.eob_branch[i][j][k][l][m] +=
+                td->counts->eob_branch[i][j][k][l][m];
+              // cm->counts.coef is only updated at frame level, so there is
+              // no need to accumulate it here. The skipped loop would be:
+              // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+              //   cm->counts.coef[i][j][k][l][m][n] +=
+              //       td->counts->coef[i][j][k][l][m][n];
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    for (j = 0; j < SWITCHABLE_FILTERS; j++)
+      cm->counts.switchable_interp[i][j] += td->counts->switchable_interp[i][j];
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    for (j = 0; j < INTER_MODES; j++)
+      cm->counts.inter_mode[i][j] += td->counts->inter_mode[i][j];
+
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.intra_inter[i][j] += td->counts->intra_inter[i][j];
+
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_inter[i][j] += td->counts->comp_inter[i][j];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      for (k = 0; k < 2; k++)
+      cm->counts.single_ref[i][j][k] += td->counts->single_ref[i][j][k];
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.comp_ref[i][j] += td->counts->comp_ref[i][j];
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZES; j++)
+      cm->counts.tx.p32x32[i][j] += td->counts->tx.p32x32[i][j];
+
+    for (j = 0; j < TX_SIZES - 1; j++)
+      cm->counts.tx.p16x16[i][j] += td->counts->tx.p16x16[i][j];
+
+    for (j = 0; j < TX_SIZES - 2; j++)
+      cm->counts.tx.p8x8[i][j] += td->counts->tx.p8x8[i][j];
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      cm->counts.skip[i][j] += td->counts->skip[i][j];
+
+  for (i = 0; i < MV_JOINTS; i++)
+    cm->counts.mv.joints[i] += td->counts->mv.joints[i];
+
+  for (k = 0; k < 2; k++) {
+    nmv_component_counts *comps = &cm->counts.mv.comps[k];
+    nmv_component_counts *comps_t = &td->counts->mv.comps[k];
+
+    for (i = 0; i < 2; i++) {
+      comps->sign[i] += comps_t->sign[i];
+      comps->class0_hp[i] += comps_t->class0_hp[i];
+      comps->hp[i] += comps_t->hp[i];
+    }
+
+    for (i = 0; i < MV_CLASSES; i++)
+      comps->classes[i] += comps_t->classes[i];
+
+    for (i = 0; i < CLASS0_SIZE; i++) {
+      comps->class0[i] += comps_t->class0[i];
+      for (j = 0; j < MV_FP_SIZE; j++)
+        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+    }
+
+    for (i = 0; i < MV_OFFSET_BITS; i++)
+      for (j = 0; j < 2; j++)
+        comps->bits[i][j] += comps_t->bits[i][j];
+
+    for (i = 0; i < MV_FP_SIZE; i++)
+      comps->fp[i] += comps_t->fp[i];
+  }
+}
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+  int i, j, k, l, m, n;  // Every loop below adds td_t->rd_counts into td's.
+
+  for (i = 0; i < REFERENCE_MODES; i++)
+    td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+    td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
+
+  for (i = 0; i < TX_MODES; i++)
+    td->rd_counts.tx_select_diff[i] += td_t->rd_counts.tx_select_diff[i];
+
+  for (i = 0; i < TX_SIZES; i++)
+    for (j = 0; j < PLANE_TYPES; j++)
+      for (k = 0; k < REF_TYPES; k++)
+        for (l = 0; l < COEF_BANDS; l++)
+          for (m = 0; m < COEFF_CONTEXTS; m++)
+            for (n = 0; n < ENTROPY_TOKENS; n++)
+              td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+                  td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
+// Worker hook: encodes tiles thread_data->start, start + num_workers, ...
+// (row-major tile index) with this worker's thread data. Always returns 0.
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+  VP9_COMP *const cpi = thread_data->cpi;
+  const int tile_cols = 1 << cpi->common.log2_tile_cols;
+  const int num_tiles = (1 << cpi->common.log2_tile_rows) * tile_cols;
+  int tile_idx = thread_data->start;
+
+  (void) unused;
+
+  while (tile_idx < num_tiles) {
+    // Split the row-major tile index into its row and column.
+    vp9_encode_tile(cpi, thread_data->td,
+                    tile_idx / tile_cols, tile_idx % tile_cols);
+    tile_idx += cpi->num_workers;
+  }
+
+  return 0;
+}
+
+// Encodes all tiles of the frame using MIN(max_threads, tile_cols) workers.
+// The last worker runs synchronously on the calling thread with cpi->td.
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+  const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
+  int i;
+
+  vp9_init_tile_data(cpi);
+
+  // Only run once to create threads and allocate thread data.
+  // NOTE(review): reused by later calls; assumes num_workers stays fixed.
+  if (cpi->num_workers == 0) {
+    CHECK_MEM_ERROR(cm, cpi->workers,
+                    vpx_malloc(num_workers * sizeof(*cpi->workers)));
+
+    for (i = 0; i < num_workers; i++) {
+      VP9Worker *const worker = &cpi->workers[i];
+      EncWorkerData *thread_data;
+
+      ++cpi->num_workers;
+
+      winterface->init(worker);
+      CHECK_MEM_ERROR(cm, worker->data1,
+                      (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData)));
+      thread_data = (EncWorkerData*)worker->data1;
+      thread_data->cpi = cpi;
+
+      if (i < num_workers - 1) {
+        // Allocate thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td,
+                        vpx_calloc(1, sizeof(*thread_data->td)));
+        // Set up pc_tree.
+        thread_data->td->leaf_tree = NULL;
+        thread_data->td->pc_tree = NULL;
+        vp9_setup_pc_tree(cm, thread_data->td);
+
+        // Allocate frame counters in thread data.
+        CHECK_MEM_ERROR(cm, thread_data->td->counts,
+                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+        // Create threads
+        if (!winterface->reset(worker))
+          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                             "Tile encoder thread creation failed");
+      } else {
+        // Main thread acts as a worker and uses the thread data in cpi.
+        thread_data->td = &cpi->td;
+      }
+
+      // data2 is unused.
+      worker->data2 = NULL;
+
+      winterface->sync(worker);
+      worker->hook = (VP9WorkerHook)enc_worker_hook;
+    }
+  }
+
+  for (i = 0; i < num_workers; i++) {
+    VP9Worker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Before encoding a frame, copy the thread data from cpi. The main-thread
+    // worker aliases cpi->td, so skip copies whose source is the destination.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+      thread_data->td->rd_counts = cpi->td.rd_counts;
+    }
+    if (thread_data->td->counts != &cpi->common.counts)
+      vpx_memcpy(thread_data->td->counts, &cpi->common.counts,
+                 sizeof(cpi->common.counts));
+
+    // Handle use_nonrd_pick_mode case.
+    if (cpi->sf.use_nonrd_pick_mode) {
+      MACROBLOCK *const x = &thread_data->td->mb;
+      MACROBLOCKD *const xd = &x->e_mbd;
+      struct macroblock_plane *const p = x->plane;
+      struct macroblockd_plane *const pd = xd->plane;
+      PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+      int j;
+
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        p[j].coeff = ctx->coeff_pbuf[j][0];
+        p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+        pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+        p[j].eobs = ctx->eobs_pbuf[j][0];
+      }
+    }
+  }
+
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VP9Worker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    if (i == num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++)
+    winterface->sync(&cpi->workers[i]);
+
+  // Accumulate counters from every worker except the main thread's.
+  for (i = 0; i < num_workers - 1; i++) {
+    EncWorkerData *const thread_data = (EncWorkerData*)cpi->workers[i].data1;
+
+    accumulate_frame_counts(&cpi->common, thread_data->td);
+    accumulate_rd_opt(&cpi->td, thread_data->td);
+  }
+}
diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h
new file mode 100644
index 000000000..e87c50bc7
--- /dev/null
+++ b/vp9/encoder/vp9_ethread.h
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ETHREAD_H_
+#define VP9_ENCODER_VP9_ETHREAD_H_
+
+struct VP9_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+  struct VP9_COMP *cpi;   // Encoder instance this worker encodes for.
+  struct ThreadData *td;  // Thread data handed to vp9_encode_tile().
+  int start;              // Index of the first tile the worker encodes.
+} EncWorkerData;
+
+void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+#endif  // VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 5559f8a7b..81334e448 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -153,7 +153,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
     const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
-#if CONFIG_VP9_HIGHDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
     int filt_guess;
     switch (cm->bit_depth) {
       case VPX_BITS_8:
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 30709e032..5f033fd20 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -852,9 +852,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (reuse_inter_pred && best_pred != NULL) {
       if (best_pred->data == orig_dst.buf) {
         this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+                                   this_mode_pred->data, this_mode_pred->stride,
+                                   NULL, 0, NULL, 0, bw, bh, xd->bd);
+        else
+          vp9_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride,
+                          NULL, 0, NULL, 0, bw, bh);
+#else
         vp9_convolve_copy(best_pred->data, best_pred->stride,
                           this_mode_pred->data, this_mode_pred->stride,
                           NULL, 0, NULL, 0, bw, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
         best_pred = this_mode_pred;
       }
     }
@@ -910,7 +921,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       vp9_convolve_copy(best_pred->data, best_pred->stride,
                         pd->dst.buf, pd->dst.stride, NULL, 0,
                         NULL, 0, bw, bh);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
   }
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index e96c9046d..37b6718bf 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1483,6 +1483,12 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
       target = calc_pframe_target_size_one_pass_cbr(cpi);
     }
   }
+
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_update_parameters(cpi);
+
   vp9_rc_set_frame_target(cpi, target);
   rc->frames_till_gf_update_due = INT_MAX;
   rc->baseline_gf_interval = INT_MAX;
@@ -1516,6 +1522,11 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
     rc->gfu_boost = DEFAULT_GF_BOOST;
   }
 
+  // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+  // should be done here, before the frame qp is selected.
+  if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+    vp9_cyclic_refresh_update_parameters(cpi);
+
   if (cm->frame_type == KEY_FRAME)
     target = calc_iframe_target_size_one_pass_cbr(cpi);
   else
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 7ec126e4b..099993aa6 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -269,8 +269,9 @@ void FDCT32x32_2D(const int16_t *input,
           step1[30] = SUB_EPI16(in01, in30);
           step1[31] = SUB_EPI16(in00, in31);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(step1[0], step1[1], step1[2],
-                        step1[3], step1[28], step1[29], step1[30], step1[31]);
+          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+                                             &step1[3], &step1[28], &step1[29],
+                                             &step1[30], &step1[31]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -295,9 +296,9 @@ void FDCT32x32_2D(const int16_t *input,
           step1[26] = SUB_EPI16(in05, in26);
           step1[27] = SUB_EPI16(in04, in27);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(step1[4], step1[5], step1[6],
-                                             step1[7], step1[24], step1[25],
-                                             step1[26], step1[27]);
+          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+                                             &step1[7], &step1[24], &step1[25],
+                                             &step1[26], &step1[27]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -322,9 +323,9 @@ void FDCT32x32_2D(const int16_t *input,
           step1[22] = SUB_EPI16(in09, in22);
           step1[23] = SUB_EPI16(in08, in23);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(step1[8], step1[9], step1[10],
-                                             step1[11], step1[20], step1[21],
-                                             step1[22], step1[23]);
+          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+                                             &step1[11], &step1[20], &step1[21],
+                                             &step1[22], &step1[23]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -349,9 +350,9 @@ void FDCT32x32_2D(const int16_t *input,
           step1[18] = SUB_EPI16(in13, in18);
           step1[19] = SUB_EPI16(in12, in19);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(step1[12], step1[13], step1[14],
-                                             step1[15], step1[16], step1[17],
-                                             step1[18], step1[19]);
+          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+                                             &step1[15], &step1[16], &step1[17],
+                                             &step1[18], &step1[19]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -379,10 +380,10 @@ void FDCT32x32_2D(const int16_t *input,
         step2[15] = SUB_EPI16(step1[0], step1[15]);
 #if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x16(
-            step2[0], step2[1], step2[2], step2[3],
-            step2[4], step2[5], step2[6], step2[7],
-            step2[8], step2[9], step2[10], step2[11],
-            step2[12], step2[13], step2[14], step2[15]);
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -460,9 +461,9 @@ void FDCT32x32_2D(const int16_t *input,
         step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
         step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step2[20], step2[21], step2[22],
-                                           step2[23], step2[24], step2[25],
-                                           step2[26], step2[27]);
+        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+                                           &step2[23], &step2[24], &step2[25],
+                                           &step2[26], &step2[27]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -544,14 +545,14 @@ void FDCT32x32_2D(const int16_t *input,
         step1[31] = SUB_EPI16(step1[31], s3_31_0);
 #if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x32(
-            step2[0], step2[1], step2[2], step2[3],
-            step2[4], step2[5], step2[6], step2[7],
-            step2[8], step2[9], step2[10], step2[11],
-            step2[12], step2[13], step2[14], step2[15],
-            step1[16], step1[17], step1[18], step1[19],
-            step2[20], step2[21], step2[22], step2[23],
-            step2[24], step2[25], step2[26], step2[27],
-            step1[28], step1[29], step1[30], step1[31]);
+            &step2[0], &step2[1], &step2[2], &step2[3],
+            &step2[4], &step2[5], &step2[6], &step2[7],
+            &step2[8], &step2[9], &step2[10], &step2[11],
+            &step2[12], &step2[13], &step2[14], &step2[15],
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
         if (overflow) {
           HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
           return;
@@ -639,9 +640,9 @@ void FDCT32x32_2D(const int16_t *input,
         step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
         step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step3[0], step3[1], step3[2],
-                                           step3[3], step3[4], step3[5],
-                                           step3[6], step3[7]);
+        overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+                                           &step3[3], &step3[4], &step3[5],
+                                           &step3[6], &step3[7]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -687,8 +688,8 @@ void FDCT32x32_2D(const int16_t *input,
         step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
         step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(step3[10], step3[11],
-                                           step3[12], step3[13]);
+        overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+                                           &step3[12], &step3[13]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -717,10 +718,10 @@ void FDCT32x32_2D(const int16_t *input,
         step3[31] = ADD_EPI16(step2[24], step1[31]);
 #if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x16(
-            step3[16], step3[17], step3[18], step3[19],
-            step3[20], step3[21], step3[22], step3[23],
-            step3[24], step3[25], step3[26], step3[27],
-            step3[28], step3[29], step3[30], step3[31]);
+            &step3[16], &step3[17], &step3[18], &step3[19],
+            &step3[20], &step3[21], &step3[22], &step3[23],
+            &step3[24], &step3[25], &step3[26], &step3[27],
+            &step3[28], &step3[29], &step3[30], &step3[31]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -747,10 +748,10 @@ void FDCT32x32_2D(const int16_t *input,
         step1[15] = ADD_EPI16(step3[12], step2[15]);
 #if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x16(
-            step1[0], step1[1], step1[2], step1[3],
-            step1[4], step1[5], step1[6], step1[7],
-            step1[8], step1[9], step1[10], step1[11],
-            step1[12], step1[13], step1[14], step1[15]);
+            &step1[0], &step1[1], &step1[2], &step1[3],
+            &step1[4], &step1[5], &step1[6], &step1[7],
+            &step1[8], &step1[9], &step1[10], &step1[11],
+            &step1[12], &step1[13], &step1[14], &step1[15]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -780,7 +781,7 @@ void FDCT32x32_2D(const int16_t *input,
         step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
         step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x2(step1[5], step1[6]);
+        overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -858,9 +859,9 @@ void FDCT32x32_2D(const int16_t *input,
         step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
         step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step1[18], step1[19], step1[20],
-                                           step1[21], step1[26], step1[27],
-                                           step1[28], step1[29]);
+        overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+                                           &step1[21], &step1[26], &step1[27],
+                                           &step1[28], &step1[29]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -877,8 +878,8 @@ void FDCT32x32_2D(const int16_t *input,
         step2[6] = SUB_EPI16(step3[7], step1[6]);
         step2[7] = ADD_EPI16(step1[6], step3[7]);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(step2[4], step2[5],
-                                           step2[6], step2[7]);
+        overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+                                           &step2[6], &step2[7]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -924,7 +925,8 @@ void FDCT32x32_2D(const int16_t *input,
         out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
         out[24] = _mm_packs_epi32(out_24_6, out_24_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(out[0], out[16], out[8], out[24]);
+        overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                           &out[8], &out[24]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -970,8 +972,8 @@ void FDCT32x32_2D(const int16_t *input,
         step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
         step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(step2[9], step2[10],
-                                           step2[13], step2[14]);
+        overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+                                           &step2[13], &step2[14]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1000,10 +1002,10 @@ void FDCT32x32_2D(const int16_t *input,
         step2[31] = ADD_EPI16(step1[28], step3[31]);
 #if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x16(
-            step2[16], step2[17], step2[18], step2[19],
-            step2[20], step2[21], step2[22], step2[23],
-            step2[24], step2[25], step2[26], step2[27],
-            step2[28], step2[29], step2[30], step2[31]);
+            &step2[16], &step2[17], &step2[18], &step2[19],
+            &step2[20], &step2[21], &step2[22], &step2[23],
+            &step2[24], &step2[25], &step2[26], &step2[27],
+            &step2[28], &step2[29], &step2[30], &step2[31]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1054,7 +1056,8 @@ void FDCT32x32_2D(const int16_t *input,
         out[12] = _mm_packs_epi32(out_12_6, out_12_7);
         out[28] = _mm_packs_epi32(out_28_6, out_28_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(out[4], out[20], out[12], out[28]);
+        overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                           &out[12], &out[28]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1074,9 +1077,9 @@ void FDCT32x32_2D(const int16_t *input,
         step3[14] = SUB_EPI16(step1[15], step2[14]);
         step3[15] = ADD_EPI16(step2[14], step1[15]);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step3[8], step3[9], step3[10],
-                                           step3[11], step3[12], step3[13],
-                                           step3[14], step3[15]);
+        overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+                                           &step3[11], &step3[12], &step3[13],
+                                           &step3[14], &step3[15]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1155,9 +1158,9 @@ void FDCT32x32_2D(const int16_t *input,
         step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
         step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step3[17], step3[18], step3[21],
-                                           step3[22], step3[25], step3[26],
-                                           step3[29], step3[30]);
+        overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+                                           &step3[22], &step3[25], &step3[26],
+                                           &step3[29], &step3[30]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1236,8 +1239,9 @@ void FDCT32x32_2D(const int16_t *input,
         out[14] = _mm_packs_epi32(out_14_6, out_14_7);
         out[30] = _mm_packs_epi32(out_30_6, out_30_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(out[2], out[18], out[10], out[26],
-                                           out[6], out[22], out[14], out[30]);
+        overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                           &out[26], &out[6], &out[22],
+                                           &out[14], &out[30]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1266,10 +1270,10 @@ void FDCT32x32_2D(const int16_t *input,
         step1[31] = ADD_EPI16(step3[30], step2[31]);
 #if DCT_HIGH_BIT_DEPTH
         overflow = check_epi16_overflow_x16(
-            step1[16], step1[17], step1[18], step1[19],
-            step1[20], step1[21], step1[22], step1[23],
-            step1[24], step1[25], step1[26], step1[27],
-            step1[28], step1[29], step1[30], step1[31]);
+            &step1[16], &step1[17], &step1[18], &step1[19],
+            &step1[20], &step1[21], &step1[22], &step1[23],
+            &step1[24], &step1[25], &step1[26], &step1[27],
+            &step1[28], &step1[29], &step1[30], &step1[31]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1348,8 +1352,9 @@ void FDCT32x32_2D(const int16_t *input,
         out[15] = _mm_packs_epi32(out_15_6, out_15_7);
         out[31] = _mm_packs_epi32(out_31_6, out_31_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(out[1], out[17], out[9], out[25],
-                                           out[7], out[23], out[15], out[31]);
+        overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                           &out[25], &out[7], &out[23],
+                                           &out[15], &out[31]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1427,8 +1432,9 @@ void FDCT32x32_2D(const int16_t *input,
         out[11] = _mm_packs_epi32(out_11_6, out_11_7);
         out[27] = _mm_packs_epi32(out_27_6, out_27_7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(out[5], out[21], out[13], out[29],
-                                           out[3], out[19], out[11], out[27]);
+        overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                           &out[29], &out[3], &out[19],
+                                           &out[11], &out[27]);
         if (overflow) {
           if (pass == 0)
             HIGH_FDCT32x32_2D_C(input, output_org, stride);
@@ -1697,8 +1703,8 @@ void FDCT32x32_2D(const int16_t *input,
         v[6] = k_madd_epi32(u[2], k32_p16_p16);
         v[7] = k_madd_epi32(u[3], k32_p16_p16);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = k_check_epi32_overflow_8(v[0], v[1], v[2], v[3], v[4], v[5],
-                                            v[6], v[7], &kZero);
+        overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+                                            &v[4], &v[5], &v[6], &v[7], &kZero);
         if (overflow) {
           HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
           return;
@@ -1776,10 +1782,11 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_32(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-              v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-              v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -1883,8 +1890,9 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_16(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -1959,7 +1967,8 @@ void FDCT32x32_2D(const int16_t *input,
           out[ 8] = _mm_packs_epi32(u[4], u[5]);
           out[24] = _mm_packs_epi32(u[6], u[7]);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(out[0], out[16], out[8], out[24]);
+          overflow = check_epi16_overflow_x4(&out[0], &out[16],
+                                             &out[8], &out[24]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -1999,8 +2008,9 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_16(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2110,8 +2120,9 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_16(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2185,7 +2196,8 @@ void FDCT32x32_2D(const int16_t *input,
           out[12] = _mm_packs_epi32(u[4], u[5]);
           out[28] = _mm_packs_epi32(u[6], u[7]);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(out[4], out[20], out[12], out[28]);
+          overflow = check_epi16_overflow_x4(&out[4], &out[20],
+                                             &out[12], &out[28]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2271,10 +2283,11 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_32(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-              v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-              v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2394,10 +2407,11 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_32(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-              v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-              v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2531,8 +2545,9 @@ void FDCT32x32_2D(const int16_t *input,
           out[14] = _mm_packs_epi32(u[12], u[13]);
           out[30] = _mm_packs_epi32(u[14], u[15]);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(out[2], out[18], out[10], out[26],
-                             out[6], out[22], out[14], out[30]);
+          overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+                                             &out[26], &out[6], &out[22],
+                                             &out[14], &out[30]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2636,10 +2651,11 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_32(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-              v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-              v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2773,8 +2789,9 @@ void FDCT32x32_2D(const int16_t *input,
           out[15] = _mm_packs_epi32(u[12], u[13]);
           out[31] = _mm_packs_epi32(u[14], u[15]);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(out[1], out[17], out[9], out[25],
-                                             out[7], out[23], out[15], out[31]);
+          overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+                                             &out[25], &out[7], &out[23],
+                                             &out[15], &out[31]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2843,10 +2860,11 @@ void FDCT32x32_2D(const int16_t *input,
 
 #if DCT_HIGH_BIT_DEPTH
           overflow = k_check_epi32_overflow_32(
-              v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7],
-              v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15],
-              v[16], v[17], v[18], v[19], v[20], v[21], v[22], v[23],
-              v[24], v[25], v[26], v[27], v[28], v[29], v[30], v[31], &kZero);
+              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+              &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+              &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+              &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+              &kZero);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -2980,8 +2998,9 @@ void FDCT32x32_2D(const int16_t *input,
           out[11] = _mm_packs_epi32(u[12], u[13]);
           out[27] = _mm_packs_epi32(u[14], u[15]);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(out[5], out[21], out[13], out[29],
-                                             out[3], out[19], out[11], out[27]);
+          overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+                                             &out[29], &out[3], &out[19],
+                                             &out[11], &out[27]);
           if (overflow) {
             HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
             return;
@@ -3107,14 +3126,14 @@ void FDCT32x32_2D(const int16_t *input,
             // Process next 8x8
             output0 += 8;
           } else {
-            storeu_output(tr2_0, (output1 + 0 * 32));
-            storeu_output(tr2_1, (output1 + 1 * 32));
-            storeu_output(tr2_2, (output1 + 2 * 32));
-            storeu_output(tr2_3, (output1 + 3 * 32));
-            storeu_output(tr2_4, (output1 + 4 * 32));
-            storeu_output(tr2_5, (output1 + 5 * 32));
-            storeu_output(tr2_6, (output1 + 6 * 32));
-            storeu_output(tr2_7, (output1 + 7 * 32));
+            storeu_output(&tr2_0, (output1 + 0 * 32));
+            storeu_output(&tr2_1, (output1 + 1 * 32));
+            storeu_output(&tr2_2, (output1 + 2 * 32));
+            storeu_output(&tr2_3, (output1 + 3 * 32));
+            storeu_output(&tr2_4, (output1 + 4 * 32));
+            storeu_output(&tr2_5, (output1 + 5 * 32));
+            storeu_output(&tr2_6, (output1 + 6 * 32));
+            storeu_output(&tr2_7, (output1 + 7 * 32));
             // Process next 8x8
             output1 += 8;
           }
diff --git a/vp9/encoder/x86/vp9_dct_impl_sse2.c b/vp9/encoder/x86/vp9_dct_impl_sse2.c
index 3fdde83da..12fa747e8 100644
--- a/vp9/encoder/x86/vp9_dct_impl_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_impl_sse2.c
@@ -75,7 +75,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
   // This second rounding constant saves doing some extra adds at the end
   const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
                                                +(DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 =  DCT_CONST_BITS+2;
+  const int DCT_CONST_BITS2 =  DCT_CONST_BITS + 2;
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i in0, in1;
@@ -170,7 +170,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
     const __m128i x0 = _mm_packs_epi32(w0, w1);
     const __m128i x1 = _mm_packs_epi32(w2, w3);
 #if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(x0, x1);
+    overflow = check_epi16_overflow_x2(&x0, &x1);
     if (overflow) {
       vp9_highbd_fdct4x4_c(input, output, stride);
       return;
@@ -192,7 +192,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
     // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
     // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
 #if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(t0, t1);
+    overflow = check_epi16_overflow_x2(&t0, &t1);
     if (overflow) {
       vp9_highbd_fdct4x4_c(input, output, stride);
       return;
@@ -231,7 +231,7 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i x0 = _mm_packs_epi32(w0, w1);
       const __m128i x1 = _mm_packs_epi32(w2, w3);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(x0, x1);
+      overflow = check_epi16_overflow_x2(&x0, &x1);
       if (overflow) {
         vp9_highbd_fdct4x4_c(input, output, stride);
         return;
@@ -254,8 +254,8 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
   // Post-condition (v + 1) >> 2 is now incorporated into previous
   // add and right-shift commands.  Only 2 store instructions needed
   // because we are using the fact that 1/3 are stored just after 0/2.
-  storeu_output(in0, output + 0 * 4);
-  storeu_output(in1, output + 2 * 4);
+  storeu_output(&in0, output + 0 * 4);
+  storeu_output(&in1, output + 2 * 4);
 }
 
 
@@ -314,7 +314,8 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
     const __m128i q7 = SUB_EPI16(in0, in7);
 #if DCT_HIGH_BIT_DEPTH
     if (pass == 1) {
-      overflow = check_epi16_overflow_x8(q0, q1, q2, q3, q4, q5, q6, q7);
+      overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                         &q4, &q5, &q6, &q7);
       if (overflow) {
         vp9_highbd_fdct8x8_c(input, output, stride);
         return;
@@ -329,7 +330,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i r2 = SUB_EPI16(q1, q2);
       const __m128i r3 = SUB_EPI16(q0, q3);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(r0, r1, r2, r3);
+      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
       if (overflow) {
         vp9_highbd_fdct8x8_c(input, output, stride);
         return;
@@ -372,7 +373,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
         res2 = _mm_packs_epi32(w4, w5);
         res6 = _mm_packs_epi32(w6, w7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(res0, res4, res2, res6);
+        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
         if (overflow) {
           vp9_highbd_fdct8x8_c(input, output, stride);
           return;
@@ -402,7 +403,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
       const __m128i r0 = _mm_packs_epi32(s0, s1);
       const __m128i r1 = _mm_packs_epi32(s2, s3);
 #if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(r0, r1);
+      overflow = check_epi16_overflow_x2(&r0, &r1);
       if (overflow) {
         vp9_highbd_fdct8x8_c(input, output, stride);
         return;
@@ -415,7 +416,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i x2 = SUB_EPI16(q7, r1);
         const __m128i x3 = ADD_EPI16(q7, r1);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(x0, x1, x2, x3);
+        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
         if (overflow) {
           vp9_highbd_fdct8x8_c(input, output, stride);
           return;
@@ -458,7 +459,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
           res5 = _mm_packs_epi32(w4, w5);
           res3 = _mm_packs_epi32(w6, w7);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(res1, res7, res5, res3);
+          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
           if (overflow) {
             vp9_highbd_fdct8x8_c(input, output, stride);
             return;
@@ -557,14 +558,14 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
     in6 = _mm_srai_epi16(in6, 1);
     in7 = _mm_srai_epi16(in7, 1);
     // store results
-    store_output(in0, (output + 0 * 8));
-    store_output(in1, (output + 1 * 8));
-    store_output(in2, (output + 2 * 8));
-    store_output(in3, (output + 3 * 8));
-    store_output(in4, (output + 4 * 8));
-    store_output(in5, (output + 5 * 8));
-    store_output(in6, (output + 6 * 8));
-    store_output(in7, (output + 7 * 8));
+    store_output(&in0, (output + 0 * 8));
+    store_output(&in1, (output + 1 * 8));
+    store_output(&in2, (output + 2 * 8));
+    store_output(&in3, (output + 3 * 8));
+    store_output(&in4, (output + 4 * 8));
+    store_output(&in5, (output + 5 * 8));
+    store_output(&in6, (output + 6 * 8));
+    store_output(&in7, (output + 7 * 8));
   }
 }
 
@@ -720,8 +721,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         input6 = ADD_EPI16(in06, in09);
         input7 = ADD_EPI16(in07, in08);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(input0, input1, input2, input3,
-                           input4, input5, input6, input7);
+        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+                                           &input4, &input5, &input6, &input7);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -739,8 +740,10 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         step1_6 = SUB_EPI16(in01, in14);
         step1_7 = SUB_EPI16(in00, in15);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(step1_0, step1_1, step1_2, step1_3,
-                           step1_4, step1_5, step1_6, step1_7);
+        overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                           &step1_2, &step1_3,
+                                           &step1_4, &step1_5,
+                                           &step1_6, &step1_7);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -759,7 +762,8 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         const __m128i q6 = SUB_EPI16(input1, input6);
         const __m128i q7 = SUB_EPI16(input0, input7);
 #if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(q0, q1, q2, q3, q4, q5, q6, q7);
+        overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+                                           &q4, &q5, &q6, &q7);
         if (overflow) {
           vp9_highbd_fdct16x16_c(input, output, stride);
           return;
@@ -773,7 +777,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i r2 = SUB_EPI16(q1, q2);
           const __m128i r3 = SUB_EPI16(q0, q3);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(r0, r1, r2, r3);
+          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -786,16 +790,16 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
             const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
             const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
             const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-            res00 = mult_round_shift(t0, t1, k__cospi_p16_p16,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res08 = mult_round_shift(t0, t1, k__cospi_p16_m16,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res04 = mult_round_shift(t2, t3, k__cospi_p24_p08,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res12 = mult_round_shift(t2, t3, k__cospi_m08_p24,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(res00, res08, res04, res12);
+            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
             if (overflow) {
               vp9_highbd_fdct16x16_c(input, output, stride);
               return;
@@ -809,12 +813,14 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           // into 32 bits.
           const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
           const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i r0 = mult_round_shift(d0, d1, k__cospi_p16_m16,
-                                k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          const __m128i r1 = mult_round_shift(d0, d1, k__cospi_p16_p16,
-                                k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
+          const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+                                              &k__DCT_CONST_ROUNDING,
+                                              DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(r0, r1);
+          overflow = check_epi16_overflow_x2(&r0, &r1);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -827,7 +833,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
             const __m128i x2 = SUB_EPI16(q7, r1);
             const __m128i x3 = ADD_EPI16(q7, r1);
 #if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(x0, x1, x2, x3);
+            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
             if (overflow) {
               vp9_highbd_fdct16x16_c(input, output, stride);
               return;
@@ -840,16 +846,17 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
               const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
               const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
               const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-              res02 = mult_round_shift(t0, t1, k__cospi_p28_p04,
-                                k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res14 = mult_round_shift(t0, t1, k__cospi_m04_p28,
-                                k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res10 = mult_round_shift(t2, t3, k__cospi_p12_p20,
-                                k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res06 = mult_round_shift(t2, t3, k__cospi_m20_p12,
-                                k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-              overflow = check_epi16_overflow_x4(res02, res14, res10, res06);
+              overflow = check_epi16_overflow_x4(&res02, &res14,
+                                                 &res10, &res06);
               if (overflow) {
                 vp9_highbd_fdct16x16_c(input, output, stride);
                 return;
@@ -867,17 +874,17 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
           const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
           const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          step2_2 = mult_round_shift(t0, t1, k__cospi_p16_m16,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_3 = mult_round_shift(t2, t3, k__cospi_p16_m16,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(t0, t1, k__cospi_p16_p16,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_4 = mult_round_shift(t2, t3, k__cospi_p16_p16,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(step2_2, step2_3, step2_5,
-                                             step2_4);
+          overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+                                             &step2_4);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -895,8 +902,10 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           step3_6 = ADD_EPI16(step1_6, step2_5);
           step3_7 = ADD_EPI16(step1_7, step2_4);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(step3_0, step3_1, step3_2, step3_3,
-                             step3_4, step3_5, step3_6, step3_7);
+          overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+                                             &step3_2, &step3_3,
+                                             &step3_4, &step3_5,
+                                             &step3_6, &step3_7);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -909,17 +918,17 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
           const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
           const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          step2_1 = mult_round_shift(t0, t1, k__cospi_m08_p24,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_2 = mult_round_shift(t2, t3, k__cospi_p24_p08,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_6 = mult_round_shift(t0, t1, k__cospi_p24_p08,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(t2, t3, k__cospi_p08_m24,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(step2_1, step2_2, step2_6,
-                                             step2_5);
+          overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+                                             &step2_5);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -937,8 +946,10 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           step1_6 = SUB_EPI16(step3_7, step2_6);
           step1_7 = ADD_EPI16(step3_7, step2_6);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(step1_0, step1_1, step1_2, step1_3,
-                             step1_4, step1_5, step1_6, step1_7);
+          overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+                                             &step1_2, &step1_3,
+                                             &step1_4, &step1_5,
+                                             &step1_6, &step1_7);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -951,16 +962,16 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
           const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
           const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          res01 = mult_round_shift(t0, t1, k__cospi_p30_p02,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res09 = mult_round_shift(t2, t3, k__cospi_p14_p18,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res15 = mult_round_shift(t0, t1, k__cospi_m02_p30,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res07 = mult_round_shift(t2, t3, k__cospi_m18_p14,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(res01, res09, res15, res07);
+          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -972,16 +983,16 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
           const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
           const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
           const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          res05 = mult_round_shift(t0, t1, k__cospi_p22_p10,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res13 = mult_round_shift(t2, t3, k__cospi_p06_p26,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res11 = mult_round_shift(t0, t1, k__cospi_m10_p22,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res03 = mult_round_shift(t2, t3, k__cospi_m26_p06,
-                                     k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
 #if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(res05, res13, res11, res03);
+          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
           if (overflow) {
             vp9_highbd_fdct16x16_c(input, output, stride);
             return;
@@ -990,11 +1001,11 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
         }
       }
       // Transpose the results, do it as two 8x8 transposes.
-      transpose_and_output8x8(res00, res01, res02, res03,
-                              res04, res05, res06, res07,
+      transpose_and_output8x8(&res00, &res01, &res02, &res03,
+                              &res04, &res05, &res06, &res07,
                               pass, out0, out1);
-      transpose_and_output8x8(res08, res09, res10, res11,
-                              res12, res13, res14, res15,
+      transpose_and_output8x8(&res08, &res09, &res10, &res11,
+                              &res12, &res13, &res14, &res15,
                               pass, out0 + 8, out1 + 8);
       if (pass == 0) {
         out0 += 8*16;
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 81da34306..e671f3998 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -40,7 +40,7 @@ void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
 
   in1 = _mm_add_epi32(tmp, in0);
   in0 = _mm_slli_epi32(in1, 1);
-  store_output(in0, output);
+  store_output(&in0, output);
 }
 
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
@@ -72,8 +72,8 @@ static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
   __m128i out23 = _mm_add_epi16(in23, kOne);
   out01 = _mm_srai_epi16(out01, 2);
   out23 = _mm_srai_epi16(out23, 2);
-  store_output(out01, (output + 0 * 8));
-  store_output(out23, (output + 1 * 8));
+  store_output(&out01, (output + 0 * 8));
+  store_output(&out23, (output + 1 * 8));
 }
 
 static INLINE void transpose_4x4(__m128i *res) {
@@ -245,7 +245,7 @@ void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
   in0 = _mm_srli_si128(sum, 8);
 
   in1 = _mm_add_epi32(sum, in0);
-  store_output(in1, output);
+  store_output(&in1, output);
 }
 
 void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
@@ -759,14 +759,14 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
 // write 8x8 array
 static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                     int stride) {
-  store_output(res[0], (output + 0 * stride));
-  store_output(res[1], (output + 1 * stride));
-  store_output(res[2], (output + 2 * stride));
-  store_output(res[3], (output + 3 * stride));
-  store_output(res[4], (output + 4 * stride));
-  store_output(res[5], (output + 5 * stride));
-  store_output(res[6], (output + 6 * stride));
-  store_output(res[7], (output + 7 * stride));
+  store_output(&res[0], (output + 0 * stride));
+  store_output(&res[1], (output + 1 * stride));
+  store_output(&res[2], (output + 2 * stride));
+  store_output(&res[3], (output + 3 * stride));
+  store_output(&res[4], (output + 4 * stride));
+  store_output(&res[5], (output + 5 * stride));
+  store_output(&res[6], (output + 6 * stride));
+  store_output(&res[7], (output + 7 * stride));
 }
 
 // perform in-place transpose
@@ -1292,7 +1292,7 @@ void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
 
   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 1);
-  store_output(in1, output);
+  store_output(&in1, output);
 }
 
 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
@@ -2251,7 +2251,7 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
 
   in1 = _mm_add_epi32(sum, in0);
   in1 = _mm_srai_epi32(in1, 3);
-  store_output(in1, output);
+  store_output(&in1, output);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/x86/vp9_dct_sse2.h b/vp9/encoder/x86/vp9_dct_sse2.h
index 2d322103e..b99db923e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.h
+++ b/vp9/encoder/x86/vp9_dct_sse2.h
@@ -43,99 +43,144 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
   return _mm_unpacklo_epi64(buf0, buf1);
 }
 
-static INLINE int check_epi16_overflow_x2(__m128i reg0, __m128i reg1) {
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+                                          const __m128i *preg1) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
   const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
-                              _mm_cmpeq_epi16(reg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
-                              _mm_cmpeq_epi16(reg1, min_overflow));
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
   cmp0 = _mm_or_si128(cmp0, cmp1);
   return _mm_movemask_epi8(cmp0);
 }
 
-static INLINE int check_epi16_overflow_x4(__m128i reg0, __m128i reg1,
-                                          __m128i reg2, __m128i reg3) {
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3) {
   const __m128i max_overflow = _mm_set1_epi16(0x7fff);
   const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(reg0, max_overflow),
-                              _mm_cmpeq_epi16(reg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(reg1, max_overflow),
-                              _mm_cmpeq_epi16(reg1, min_overflow));
-  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(reg2, max_overflow),
-                              _mm_cmpeq_epi16(reg2, min_overflow));
-  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(reg3, max_overflow),
-                              _mm_cmpeq_epi16(reg3, min_overflow));
+  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+                              _mm_cmpeq_epi16(*preg0, min_overflow));
+  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+                              _mm_cmpeq_epi16(*preg1, min_overflow));
+  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+                              _mm_cmpeq_epi16(*preg2, min_overflow));
+  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+                              _mm_cmpeq_epi16(*preg3, min_overflow));
   cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
   return _mm_movemask_epi8(cmp0);
 }
 
-static INLINE int check_epi16_overflow_x8(__m128i reg0, __m128i reg1,
-                                          __m128i reg2, __m128i reg3,
-                                          __m128i reg4, __m128i reg5,
-                                          __m128i reg6, __m128i reg7) {
+static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
+                                          const __m128i *preg1,
+                                          const __m128i *preg2,
+                                          const __m128i *preg3,
+                                          const __m128i *preg4,
+                                          const __m128i *preg5,
+                                          const __m128i *preg6,
+                                          const __m128i *preg7) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   return res0 + res1;
 }
 
-static INLINE int check_epi16_overflow_x12(__m128i reg0, __m128i reg1,
-                                   __m128i reg2, __m128i reg3, __m128i reg4,
-                                   __m128i reg5, __m128i reg6, __m128i reg7,
-                                   __m128i reg8, __m128i reg9, __m128i reg10,
-                                   __m128i reg11) {
+static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   if (!res0)
-    res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
   return res0 + res1;
 }
 
-static INLINE int check_epi16_overflow_x16(__m128i reg0,  __m128i reg1,
-                                    __m128i reg2, __m128i reg3,  __m128i reg4,
-                                    __m128i reg5, __m128i reg6,  __m128i reg7,
-                                    __m128i reg8, __m128i reg9,  __m128i reg10,
-                                    __m128i reg11, __m128i reg12, __m128i reg13,
-                                    __m128i reg14, __m128i reg15) {
+static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   if (!res0) {
-    res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
     if (!res1)
-      res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
   }
   return res0 + res1;
 }
 
-static INLINE int check_epi16_overflow_x32(__m128i reg0,  __m128i reg1,
-                                __m128i reg2, __m128i reg3,  __m128i reg4,
-                                __m128i reg5, __m128i reg6,  __m128i reg7,
-                                __m128i reg8, __m128i reg9,  __m128i reg10,
-                                __m128i reg11, __m128i reg12, __m128i reg13,
-                                __m128i reg14, __m128i reg15, __m128i reg16,
-                                __m128i reg17, __m128i reg18, __m128i reg19,
-                                __m128i reg20, __m128i reg21, __m128i reg22,
-                                __m128i reg23, __m128i reg24, __m128i reg25,
-                                __m128i reg26, __m128i reg27, __m128i reg28,
-                                __m128i reg29, __m128i reg30, __m128i reg31) {
+static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *preg8,
+                                           const __m128i *preg9,
+                                           const __m128i *preg10,
+                                           const __m128i *preg11,
+                                           const __m128i *preg12,
+                                           const __m128i *preg13,
+                                           const __m128i *preg14,
+                                           const __m128i *preg15,
+                                           const __m128i *preg16,
+                                           const __m128i *preg17,
+                                           const __m128i *preg18,
+                                           const __m128i *preg19,
+                                           const __m128i *preg20,
+                                           const __m128i *preg21,
+                                           const __m128i *preg22,
+                                           const __m128i *preg23,
+                                           const __m128i *preg24,
+                                           const __m128i *preg25,
+                                           const __m128i *preg26,
+                                           const __m128i *preg27,
+                                           const __m128i *preg28,
+                                           const __m128i *preg29,
+                                           const __m128i *preg30,
+                                           const __m128i *preg31) {
   int res0, res1;
-  res0 = check_epi16_overflow_x4(reg0, reg1, reg2, reg3);
-  res1 = check_epi16_overflow_x4(reg4, reg5, reg6, reg7);
+  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
   if (!res0) {
-    res0 = check_epi16_overflow_x4(reg8, reg9, reg10, reg11);
+    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
     if (!res1) {
-      res1 = check_epi16_overflow_x4(reg12, reg13, reg14, reg15);
+      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
       if (!res0) {
-        res0 = check_epi16_overflow_x4(reg16, reg17, reg18, reg19);
+        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
         if (!res1) {
-          res1 = check_epi16_overflow_x4(reg20, reg21, reg22, reg23);
+          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
           if (!res0) {
-            res0 = check_epi16_overflow_x4(reg24, reg25, reg26, reg27);
+            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
             if (!res1)
-              res1 = check_epi16_overflow_x4(reg28, reg29, reg30, reg31);
+              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
           }
         }
       }
@@ -144,14 +189,17 @@ static INLINE int check_epi16_overflow_x32(__m128i reg0,  __m128i reg1,
   return res0 + res1;
 }
 
-static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
-                 __m128i reg2, __m128i reg3, const __m128i* zero) {
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *zero) {
   __m128i minus_one = _mm_set1_epi32(-1);
   // Check for overflows
-  __m128i reg0_shifted = _mm_slli_epi64(reg0, 1);
-  __m128i reg1_shifted = _mm_slli_epi64(reg1, 1);
-  __m128i reg2_shifted = _mm_slli_epi64(reg2, 1);
-  __m128i reg3_shifted = _mm_slli_epi64(reg3, 1);
+  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
   __m128i reg0_top_dwords = _mm_shuffle_epi32(
       reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
   __m128i reg1_top_dwords = _mm_shuffle_epi32(
@@ -173,65 +221,107 @@ static INLINE int k_check_epi32_overflow_4(__m128i reg0, __m128i reg1,
   return (overflow_01 + overflow_23);
 }
 
-static INLINE int k_check_epi32_overflow_8(__m128i reg0, __m128i reg1,
-                                           __m128i reg2, __m128i reg3,
-                                           __m128i reg4, __m128i reg5,
-                                           __m128i reg6, __m128i reg7,
-                                           const __m128i* zero) {
-  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
+static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
+                                           const __m128i *preg1,
+                                           const __m128i *preg2,
+                                           const __m128i *preg3,
+                                           const __m128i *preg4,
+                                           const __m128i *preg5,
+                                           const __m128i *preg6,
+                                           const __m128i *preg7,
+                                           const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
   if (!overflow) {
-    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
   }
   return overflow;
 }
 
-static INLINE int k_check_epi32_overflow_16(
-    __m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
-    __m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
-    __m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
-    const __m128i* zero) {
-  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
+static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
   if (!overflow) {
-    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
     if (!overflow) {
-      overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
+                                          zero);
       if (!overflow) {
-        overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
       }
     }
   }
   return overflow;
 }
 
-static INLINE int k_check_epi32_overflow_32(
-    __m128i reg0, __m128i reg1, __m128i reg2, __m128i reg3,
-    __m128i reg4, __m128i reg5, __m128i reg6, __m128i reg7,
-    __m128i reg8, __m128i reg9, __m128i reg10, __m128i reg11,
-    __m128i reg12, __m128i reg13, __m128i reg14, __m128i reg15,
-    __m128i reg16, __m128i reg17, __m128i reg18, __m128i reg19,
-    __m128i reg20, __m128i reg21, __m128i reg22, __m128i reg23,
-    __m128i reg24, __m128i reg25, __m128i reg26, __m128i reg27,
-    __m128i reg28, __m128i reg29, __m128i reg30, __m128i reg31,
-    const __m128i* zero) {
-  int overflow = k_check_epi32_overflow_4(reg0, reg1, reg2, reg3, zero);
+static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
+                                            const __m128i *preg1,
+                                            const __m128i *preg2,
+                                            const __m128i *preg3,
+                                            const __m128i *preg4,
+                                            const __m128i *preg5,
+                                            const __m128i *preg6,
+                                            const __m128i *preg7,
+                                            const __m128i *preg8,
+                                            const __m128i *preg9,
+                                            const __m128i *preg10,
+                                            const __m128i *preg11,
+                                            const __m128i *preg12,
+                                            const __m128i *preg13,
+                                            const __m128i *preg14,
+                                            const __m128i *preg15,
+                                            const __m128i *preg16,
+                                            const __m128i *preg17,
+                                            const __m128i *preg18,
+                                            const __m128i *preg19,
+                                            const __m128i *preg20,
+                                            const __m128i *preg21,
+                                            const __m128i *preg22,
+                                            const __m128i *preg23,
+                                            const __m128i *preg24,
+                                            const __m128i *preg25,
+                                            const __m128i *preg26,
+                                            const __m128i *preg27,
+                                            const __m128i *preg28,
+                                            const __m128i *preg29,
+                                            const __m128i *preg30,
+                                            const __m128i *preg31,
+                                            const __m128i *zero) {
+  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
   if (!overflow) {
-    overflow = k_check_epi32_overflow_4(reg4, reg5, reg6, reg7, zero);
+    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
     if (!overflow) {
-      overflow = k_check_epi32_overflow_4(reg8, reg9, reg10, reg11, zero);
+      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
       if (!overflow) {
-        overflow = k_check_epi32_overflow_4(reg12, reg13, reg14, reg15, zero);
+        overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+                                            zero);
         if (!overflow) {
-          overflow = k_check_epi32_overflow_4(reg16, reg17, reg18, reg19, zero);
+          overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
+                                              zero);
           if (!overflow) {
-            overflow = k_check_epi32_overflow_4(reg20, reg21,
-                                                reg22, reg23, zero);
+            overflow = k_check_epi32_overflow_4(preg20, preg21,
+                                                preg22, preg23, zero);
             if (!overflow) {
-              overflow = k_check_epi32_overflow_4(reg24, reg25,
-                                                  reg26, reg27, zero);
+              overflow = k_check_epi32_overflow_4(preg24, preg25,
+                                                  preg26, preg27, zero);
               if (!overflow) {
-                overflow = k_check_epi32_overflow_4(reg28, reg29,
-                                                    reg30, reg31, zero);
+                overflow = k_check_epi32_overflow_4(preg28, preg29,
+                                                    preg30, preg31, zero);
               }
             }
           }
@@ -242,51 +332,52 @@ static INLINE int k_check_epi32_overflow_32(
   return overflow;
 }
 
-static INLINE void store_output(const __m128i output, tran_low_t* dst_ptr) {
+static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
 #if CONFIG_VP9_HIGHBITDEPTH
   const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
-  __m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
   _mm_store_si128((__m128i *)(dst_ptr), out0);
   _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
 #else
-  _mm_store_si128((__m128i *)(dst_ptr), output);
+  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
-static INLINE void storeu_output(const __m128i output, tran_low_t* dst_ptr) {
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
 #if CONFIG_VP9_HIGHBITDEPTH
   const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(output, zero);
-  __m128i out0 = _mm_unpacklo_epi16(output, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(output, sign_bits);
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
   _mm_storeu_si128((__m128i *)(dst_ptr), out0);
   _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
 #else
-  _mm_storeu_si128((__m128i *)(dst_ptr), output);
+  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
 
-static INLINE __m128i mult_round_shift(const __m128i in0, const __m128i in1,
-                                       const __m128i multiplier,
-                                       const __m128i rounding,
+static INLINE __m128i mult_round_shift(const __m128i *pin0,
+                                       const __m128i *pin1,
+                                       const __m128i *pmultiplier,
+                                       const __m128i *prounding,
                                        const int shift) {
-  const __m128i u0 = _mm_madd_epi16(in0, multiplier);
-  const __m128i u1 = _mm_madd_epi16(in1, multiplier);
-  const __m128i v0 = _mm_add_epi32(u0, rounding);
-  const __m128i v1 = _mm_add_epi32(u1, rounding);
+  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+  const __m128i v0 = _mm_add_epi32(u0, *prounding);
+  const __m128i v1 = _mm_add_epi32(u1, *prounding);
   const __m128i w0 = _mm_srai_epi32(v0, shift);
   const __m128i w1 = _mm_srai_epi32(v1, shift);
   return _mm_packs_epi32(w0, w1);
 }
 
 static INLINE void transpose_and_output8x8(
-    const __m128i in00, const __m128i in01,
-    const __m128i in02, const __m128i in03,
-    const __m128i in04, const __m128i in05,
-    const __m128i in06, const __m128i in07,
+    const __m128i *pin00, const __m128i *pin01,
+    const __m128i *pin02, const __m128i *pin03,
+    const __m128i *pin04, const __m128i *pin05,
+    const __m128i *pin06, const __m128i *pin07,
     const int pass, int16_t* out0_ptr,
     tran_low_t* out1_ptr) {
   // 00 01 02 03 04 05 06 07
@@ -297,14 +388,14 @@ static INLINE void transpose_and_output8x8(
   // 50 51 52 53 54 55 56 57
   // 60 61 62 63 64 65 66 67
   // 70 71 72 73 74 75 76 77
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in00, in01);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in02, in03);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in00, in01);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in02, in03);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in04, in05);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in06, in07);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in04, in05);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in06, in07);
+  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
   // 00 10 01 11 02 12 03 13
   // 20 30 21 31 22 32 23 33
   // 04 14 05 15 06 16 07 17
@@ -355,14 +446,14 @@ static INLINE void transpose_and_output8x8(
     _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
     _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
   } else {
-    storeu_output(tr2_0, (out1_ptr + 0 * 16));
-    storeu_output(tr2_1, (out1_ptr + 1 * 16));
-    storeu_output(tr2_2, (out1_ptr + 2 * 16));
-    storeu_output(tr2_3, (out1_ptr + 3 * 16));
-    storeu_output(tr2_4, (out1_ptr + 4 * 16));
-    storeu_output(tr2_5, (out1_ptr + 5 * 16));
-    storeu_output(tr2_6, (out1_ptr + 6 * 16));
-    storeu_output(tr2_7, (out1_ptr + 7 * 16));
+    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
   }
 }