5 files changed, 161 insertions, 129 deletions
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index e339030f9..014175981 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -561,6 +561,7 @@ class DatarateTestVP9Large
     }
 
     encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+    encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
 
     if (cfg_.ts_number_layers > 1) {
       if (video->frame() == 0) {
@@ -988,7 +989,7 @@ TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) {
 }
 
 // Check basic datarate targeting, for a single bitrate, when denoiser is on,
-// for clip with high noise level.
+// for clip with high noise level. Use 2 threads.
 TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
@@ -998,11 +999,12 @@ TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
   cfg_.rc_max_quantizer = 56;
   cfg_.rc_end_usage = VPX_CBR;
   cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 2;
 
   ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200);
 
   // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
-  // there is only one denoiser mode: denoiserYonly(which is 1),
+  // there is only one denoiser mode: kDenoiserOnYOnly(which is 1),
   // but may add more modes in the future.
   cfg_.rc_target_bitrate = 1000;
   ResetModel();
@@ -1015,6 +1017,35 @@ TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
       << " The datarate for the file is greater than target by too much!";
 }
 
+// CBR datarate targeting at a single bitrate with the denoiser enabled, run on
+// a 1280x720 clip with 4 threads.
+TEST_P(DatarateTestVP9LargeDenoiser, 4threads) {
+  ::libvpx_test::Y4mVideoSource hd_clip("niklas_1280_720_30.y4m", 0, 300);
+
+  // CBR rate control with no lag, encoded with 4 threads.
+  cfg_.g_threads = 4;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  ResetModel();
+
+  // Turn on the denoiser. For the temporal denoiser
+  // (#if CONFIG_VP9_TEMPORAL_DENOISING) there is only one denoiser mode,
+  // kDenoiserOnYOnly (which is 1), but more modes may be added in the future.
+  denoiser_on_ = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&hd_clip));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.28)
+      << " The datarate for the file is greater than target by too much!";
+}
+
 // Check basic datarate targeting, for a single bitrate, when denoiser is off
 // and on.
 TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) {
@@ -1228,7 +1259,7 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
 }
 
 // Check basic rate targeting for 1 pass CBR SVC with denoising.
-// 2 spatial layers and 3 temporal layer. Run CIF clip with 1 thread.
+// 2 spatial layers and 3 temporal layers. Run HD clip with 2 threads.
 TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) {
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 500;
@@ -1243,7 +1274,7 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) {
   cfg_.ts_rate_decimator[1] = 2;
   cfg_.ts_rate_decimator[2] = 1;
   cfg_.g_error_resilient = 1;
-  cfg_.g_threads = 1;
+  cfg_.g_threads = 2;
   cfg_.temporal_layering_mode = 3;
   svc_params_.scaling_factor_num[0] = 144;
   svc_params_.scaling_factor_den[0] = 288;
@@ -1251,11 +1282,10 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) {
   svc_params_.scaling_factor_den[1] = 288;
   cfg_.rc_dropframe_thresh = 10;
   cfg_.kf_max_dist = 9999;
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 200);
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
   // TODO(marpan): Check that effective_datarate for each layer hits the
   // layer target_bitrate.
-  for (int i = 200; i <= 800; i += 200) {
+  for (int i = 600; i <= 1000; i += 200) {
     cfg_.rc_target_bitrate = i;
     ResetModel();
     denoiser_on_ = 1;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index c995a9dff..592a68274 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -314,6 +314,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
       block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
+  int increase_denoising = 0;
   int consec_zeromv = 0;
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
@@ -356,22 +357,18 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
         mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv,
         motion_level);
   }
-  if (!is_skin && denoiser->denoising_level == kDenHigh) {
-    denoiser->increase_denoising = 1;
-  } else {
-    denoiser->increase_denoising = 0;
-  }
+  if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1;
 
   if (denoiser->denoising_level >= kDenLow)
     decision = perform_motion_compensation(
-        denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx,
+        denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
         motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
         cpi->svc.number_spatial_layers, cpi->Source->y_width);
 
   if (decision == FILTER_BLOCK) {
-    decision = vp9_denoiser_filter(
-        src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start,
-        avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude);
+    decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
+                                   mc_avg.y_stride, avg_start, avg.y_stride,
+                                   increase_denoising, bs, motion_magnitude);
   }
 
   if (decision == FILTER_BLOCK) {
@@ -533,7 +530,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
 #ifdef OUTPUT_YUV_DENOISED
   make_grayscale(&denoiser->running_avg_y[i]);
 #endif
-  denoiser->increase_denoising = 0;
   denoiser->frame_buffer_initialized = 1;
   denoiser->denoising_level = kDenLow;
   denoiser->prev_denoising_level = kDenLow;
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index ce9a5966f..9bded2176 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -38,7 +38,6 @@ typedef struct vp9_denoiser {
   YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
   YV12_BUFFER_CONFIG mc_running_avg_y;
   YV12_BUFFER_CONFIG last_source;
-  int increase_denoising;
   int frame_buffer_initialized;
   int reset;
   VP9_DENOISER_LEVEL denoising_level;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4b11041b2..103680fd5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2725,6 +2725,74 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 }
 #endif
 
+// Calculate the score used in machine-learning based partition search early
+// termination. The score is a linear classifier over ctx->rate, ctx->dist,
+// ctx->sum_y_eobs, the |col| + |row| magnitude of ctx->mic.mv[0],
+// cm->base_qindex, and the partitioning of the above block, the left block and
+// the entry at (mi_row, mi_col) of cm->prev_mi_grid_visible.
+// rd_pick_partition() skips its split and rectangular partition searches when
+// the returned score is negative. bsize must be square.
+static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                            PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize) {
+  const double *clf;
+  const double *mean;
+  const double *sd;
+  const int mag_mv =
+      abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
+  MODE_INFO **prev_mi =
+      &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
+  int above_par = 0;  // above_partitioning
+  int left_par = 0;   // left_partitioning
+  int last_par = 0;   // last_partitioning
+  BLOCK_SIZE context_size;
+  double score;
+  int offset = 0;  // BLOCK_64X64 (and any unlisted size) starts at index 0
+
+  assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+
+  // Partitioning features: 2 = that block is smaller than bsize, 1 = same
+  // size, 0 = larger or not available.
+  if (xd->above_mi) {
+    context_size = xd->above_mi->sb_type;
+    above_par = (context_size < bsize) ? 2 : (context_size == bsize);
+  }
+
+  if (xd->left_mi) {
+    context_size = xd->left_mi->sb_type;
+    left_par = (context_size < bsize) ? 2 : (context_size == bsize);
+  }
+
+  // prev_mi is the address of a grid slot, so it can never be NULL by itself;
+  // test the grid and the slot's contents before dereferencing the entry.
+  if (cm->prev_mi_grid_visible && prev_mi[0]) {
+    context_size = prev_mi[0]->sb_type;
+    last_par = (context_size < bsize) ? 2 : (context_size == bsize);
+  }
+
+  // Each block size reads its own run of 8 entries from the model tables.
+  if (bsize == BLOCK_32X32)
+    offset = 8;
+  else if (bsize == BLOCK_16X16)
+    offset = 16;
+
+  clf = &classifiers[offset];
+  mean = &train_mean[offset];
+  sd = &train_stdm[offset];
+
+  // early termination score calculation
+  // NOTE(review): features 0, 1 and 4 are divided by sd[] while 2, 3, 5 and 6
+  // are multiplied by it. This is kept exactly as in the code this helper was
+  // factored out of; confirm against how train_stdm was generated.
+  score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) +
+          clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) +
+          clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) +
+          clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) +
+          clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) +
+          clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) +
+          clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7];
+  return score;
+}
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -2924,68 +2992,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
           if (!x->e_mbd.lossless &&
               !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) &&
               ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
-            const double *clf;
-            const double *mean;
-            const double *sd;
-            const int mag_mv =
-                abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
-            const int left_in_image = !!xd->left_mi;
-            const int above_in_image = !!xd->above_mi;
-            MODE_INFO **prev_mi =
-                &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
-            int above_par = 0;  // above_partitioning
-            int left_par = 0;   // left_partitioning
-            int last_par = 0;   // last_partitioning
-            BLOCK_SIZE context_size;
-            double score;
-            int offset = 0;
-
-            assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
-
-            if (above_in_image) {
-              context_size = xd->above_mi->sb_type;
-              if (context_size < bsize)
-                above_par = 2;
-              else if (context_size == bsize)
-                above_par = 1;
-            }
-
-            if (left_in_image) {
-              context_size = xd->left_mi->sb_type;
-              if (context_size < bsize)
-                left_par = 2;
-              else if (context_size == bsize)
-                left_par = 1;
-            }
-
-            if (prev_mi) {
-              context_size = prev_mi[0]->sb_type;
-              if (context_size < bsize)
-                last_par = 2;
-              else if (context_size == bsize)
-                last_par = 1;
-            }
-
-            if (bsize == BLOCK_64X64)
-              offset = 0;
-            else if (bsize == BLOCK_32X32)
-              offset = 8;
-            else if (bsize == BLOCK_16X16)
-              offset = 16;
-
-            // early termination score calculation
-            clf = &classifiers[offset];
-            mean = &train_mean[offset];
-            sd = &train_stdm[offset];
-            score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) +
-                    clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) +
-                    clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) +
-                    clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) *
-                              sd[3]) +
-                    clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) +
-                    clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) +
-                    clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7];
-            if (score < 0) {
+            if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) {
               do_split = 0;
               do_rect = 0;
             }
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
index 2f4e5ab85..4d2d95787 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -420,14 +420,14 @@ static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
   *y1 = _mm_packs_epi32(tmp2, tmp3);
 }
 
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i c0,
-                                  const __m128i c1) {
+static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
+                                  const __m128i *c1) {
   __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
 
   u0 = _mm_unpacklo_epi16(*x0, *x1);
   u1 = _mm_unpackhi_epi16(*x0, *x1);
-  BUTTERFLY_PAIR(u0, u1, c0, c1);
+  BUTTERFLY_PAIR(u0, u1, *c0, *c1);
   *x0 = _mm_packs_epi32(tmp0, tmp1);
   *x1 = _mm_packs_epi32(tmp2, tmp3);
 }
@@ -480,8 +480,8 @@ static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
   v13 = _mm_sub_epi16(u5, u7);
   v14 = _mm_add_epi16(u5, u7);
 
-  butterfly_self(&v10, &v13, stg6_0, stg4_0);
-  butterfly_self(&v11, &v12, stg6_0, stg4_0);
+  butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
+  butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
 
   // 1, 14
   x1 = _mm_mulhrs_epi16(in[0], stk4_0);  // stp1[1], stk4_1 = stk4_0
@@ -580,39 +580,39 @@ static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
   u29 = _mm_sub_epi16(v30, v29);
   u30 = _mm_add_epi16(v29, v30);
 
-  butterfly_self(&u18, &u29, stg4_4, stg4_5);
-  butterfly_self(&u19, &u28, stg4_4, stg4_5);
-  butterfly_self(&u20, &u27, stg4_6, stg4_4);
-  butterfly_self(&u21, &u26, stg4_6, stg4_4);
+  butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+  butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+  butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+  butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
 
   stp1[16] = _mm_add_epi16(u16, u23);
-  v23 = _mm_sub_epi16(u16, u23);
+  stp1[23] = _mm_sub_epi16(u16, u23);
 
   stp1[17] = _mm_add_epi16(u17, u22);
-  v22 = _mm_sub_epi16(u17, u22);
+  stp1[22] = _mm_sub_epi16(u17, u22);
 
   stp1[18] = _mm_add_epi16(u18, u21);
-  v21 = _mm_sub_epi16(u18, u21);
+  stp1[21] = _mm_sub_epi16(u18, u21);
 
   stp1[19] = _mm_add_epi16(u19, u20);
-  v20 = _mm_sub_epi16(u19, u20);
+  stp1[20] = _mm_sub_epi16(u19, u20);
 
-  v24 = _mm_sub_epi16(u31, u24);
+  stp1[24] = _mm_sub_epi16(u31, u24);
   stp1[31] = _mm_add_epi16(u24, u31);
 
-  v25 = _mm_sub_epi16(u30, u25);
+  stp1[25] = _mm_sub_epi16(u30, u25);
   stp1[30] = _mm_add_epi16(u25, u30);
 
-  v26 = _mm_sub_epi16(u29, u26);
+  stp1[26] = _mm_sub_epi16(u29, u26);
   stp1[29] = _mm_add_epi16(u26, u29);
 
-  v27 = _mm_sub_epi16(u28, u27);
+  stp1[27] = _mm_sub_epi16(u28, u27);
   stp1[28] = _mm_add_epi16(u27, u28);
 
-  butterfly(&v20, &v27, &stg6_0, &stg4_0, &stp1[20], &stp1[27]);
-  butterfly(&v21, &v26, &stg6_0, &stg4_0, &stp1[21], &stp1[26]);
-  butterfly(&v22, &v25, &stg6_0, &stg4_0, &stp1[22], &stp1[25]);
-  butterfly(&v23, &v24, &stg6_0, &stg4_0, &stp1[23], &stp1[24]);
+  butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
+  butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
+  butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
+  butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
 }
 
 // Only upper-left 8x8 has non-zero coeff
@@ -774,8 +774,8 @@ static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
     const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
     const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, stg4_4, stg4_5);
-    butterfly_self(&v10, &v13, stg4_6, stg4_4);
+    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
   }
 
   out[0] = _mm_add_epi16(v8, v11);
@@ -790,8 +790,8 @@ static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
   {
     const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
     const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], stg6_0, stg4_0);
-    butterfly_self(&out[3], &out[4], stg6_0, stg4_0);
+    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
   }
 }
 
@@ -882,10 +882,10 @@ static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
     const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
     const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
 
-    butterfly_self(&v17, &v30, stg3_4, stg3_5);
-    butterfly_self(&v18, &v29, stg3_6, stg3_4);
-    butterfly_self(&v21, &v26, stg3_8, stg3_9);
-    butterfly_self(&v22, &v25, stg3_10, stg3_8);
+    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
   }
 
   u16 = _mm_add_epi16(v16, v19);
@@ -910,10 +910,10 @@ static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
     const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
     const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, stg4_4, stg4_5);
-    butterfly_self(&u19, &u28, stg4_4, stg4_5);
-    butterfly_self(&u20, &u27, stg4_6, stg4_4);
-    butterfly_self(&u21, &u26, stg4_6, stg4_4);
+    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
   }
 
   out[0] = _mm_add_epi16(u16, u23);
@@ -1069,8 +1069,8 @@ static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
     const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
     const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&v9, &v14, stg4_4, stg4_5);
-    butterfly_self(&v10, &v13, stg4_6, stg4_4);
+    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
   }
 
   out[0] = _mm_add_epi16(v8, v11);
@@ -1086,8 +1086,8 @@ static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
   {
     const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
     const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[2], &out[5], stg6_0, stg4_0);
-    butterfly_self(&out[3], &out[4], stg6_0, stg4_0);
+    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
   }
 }
 
@@ -1208,10 +1208,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
     const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
     const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
     const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-    butterfly_self(&v17, &v30, stg3_4, stg3_5);
-    butterfly_self(&v18, &v29, stg3_6, stg3_4);
-    butterfly_self(&v21, &v26, stg3_8, stg3_9);
-    butterfly_self(&v22, &v25, stg3_10, stg3_8);
+    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
   }
 
   u16 = _mm_add_epi16(v16, v19);
@@ -1237,10 +1237,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
     const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
     const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    butterfly_self(&u18, &u29, stg4_4, stg4_5);
-    butterfly_self(&u19, &u28, stg4_4, stg4_5);
-    butterfly_self(&u20, &u27, stg4_6, stg4_4);
-    butterfly_self(&u21, &u26, stg4_6, stg4_4);
+    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
   }
 
   out[0] = _mm_add_epi16(u16, u23);
@@ -1264,10 +1264,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
   {
     const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
     const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-    butterfly_self(&out[4], &out[11], stg6_0, stg4_0);
-    butterfly_self(&out[5], &out[10], stg6_0, stg4_0);
-    butterfly_self(&out[6], &out[9], stg6_0, stg4_0);
-    butterfly_self(&out[7], &out[8], stg6_0, stg4_0);
+    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
+    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
+    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
+    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
   }
 }