summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- test/datarate_test.cc 44
-rw-r--r-- vp9/encoder/vp9_denoiser.c 16
-rw-r--r-- vp9/encoder/vp9_denoiser.h 1
-rw-r--r-- vp9/encoder/vp9_encodeframe.c 131
-rw-r--r-- vpx_dsp/x86/inv_txfm_ssse3.c 98
5 files changed, 161 insertions, 129 deletions
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index e339030f9..014175981 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -561,6 +561,7 @@ class DatarateTestVP9Large
}
encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+ encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
if (cfg_.ts_number_layers > 1) {
if (video->frame() == 0) {
@@ -988,7 +989,7 @@ TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) {
}
// Check basic datarate targeting, for a single bitrate, when denoiser is on,
-// for clip with high noise level.
+// for clip with high noise level. Use 2 threads.
TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
@@ -998,11 +999,12 @@ TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
cfg_.rc_max_quantizer = 56;
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
+ cfg_.g_threads = 2;
::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200);
// For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
- // there is only one denoiser mode: denoiserYonly(which is 1),
+ // there is only one denoiser mode: kDenoiserOnYOnly(which is 1),
// but may add more modes in the future.
cfg_.rc_target_bitrate = 1000;
ResetModel();
@@ -1015,6 +1017,35 @@ TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) {
<< " The datarate for the file is greater than target by too much!";
}
+// Check basic datarate targeting, for a single bitrate, when denoiser is on,
+// for 1280x720 clip with 4 threads.
+TEST_P(DatarateTestVP9LargeDenoiser, 4threads) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_threads = 4;
+
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
+
+ // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+ // there is only one denoiser mode: kDenoiserOnYOnly(which is 1),
+ // but may add more modes in the future.
+ cfg_.rc_target_bitrate = 1000;
+ ResetModel();
+ // Turn on the denoiser.
+ denoiser_on_ = 1;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.28)
+ << " The datarate for the file is greater than target by too much!";
+}
+
// Check basic datarate targeting, for a single bitrate, when denoiser is off
// and on.
TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) {
@@ -1228,7 +1259,7 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
}
// Check basic rate targeting for 1 pass CBR SVC with denoising.
-// 2 spatial layers and 3 temporal layer. Run CIF clip with 1 thread.
+// 2 spatial layers and 3 temporal layers. Run HD clip with 2 threads.
TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
@@ -1243,7 +1274,7 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) {
cfg_.ts_rate_decimator[1] = 2;
cfg_.ts_rate_decimator[2] = 1;
cfg_.g_error_resilient = 1;
- cfg_.g_threads = 1;
+ cfg_.g_threads = 2;
cfg_.temporal_layering_mode = 3;
svc_params_.scaling_factor_num[0] = 144;
svc_params_.scaling_factor_den[0] = 288;
@@ -1251,11 +1282,10 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) {
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10;
cfg_.kf_max_dist = 9999;
- ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 200);
+ ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300);
// TODO(marpan): Check that effective_datarate for each layer hits the
// layer target_bitrate.
- for (int i = 200; i <= 800; i += 200) {
+ for (int i = 600; i <= 1000; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
denoiser_on_ = 1;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index c995a9dff..592a68274 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -314,6 +314,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
struct buf_2d src = mb->plane[0].src;
int is_skin = 0;
+ int increase_denoising = 0;
int consec_zeromv = 0;
mv_col = ctx->best_sse_mv.as_mv.col;
mv_row = ctx->best_sse_mv.as_mv.row;
@@ -356,22 +357,18 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv,
motion_level);
}
- if (!is_skin && denoiser->denoising_level == kDenHigh) {
- denoiser->increase_denoising = 1;
- } else {
- denoiser->increase_denoising = 0;
- }
+ if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1;
if (denoiser->denoising_level >= kDenLow)
decision = perform_motion_compensation(
- denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx,
+ denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
cpi->svc.number_spatial_layers, cpi->Source->y_width);
if (decision == FILTER_BLOCK) {
- decision = vp9_denoiser_filter(
- src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start,
- avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude);
+ decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
+ mc_avg.y_stride, avg_start, avg.y_stride,
+ increase_denoising, bs, motion_magnitude);
}
if (decision == FILTER_BLOCK) {
@@ -533,7 +530,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx,
#ifdef OUTPUT_YUV_DENOISED
make_grayscale(&denoiser->running_avg_y[i]);
#endif
- denoiser->increase_denoising = 0;
denoiser->frame_buffer_initialized = 1;
denoiser->denoising_level = kDenLow;
denoiser->prev_denoising_level = kDenLow;
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index ce9a5966f..9bded2176 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -38,7 +38,6 @@ typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
YV12_BUFFER_CONFIG last_source;
- int increase_denoising;
int frame_buffer_initialized;
int reset;
VP9_DENOISER_LEVEL denoising_level;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4b11041b2..103680fd5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2725,6 +2725,74 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
}
#endif
+// Compute the linear-classifier score used by the machine-learning based
+// partition search early termination (caller prunes splits when it is < 0).
+static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const double *clf;
+ const double *mean;
+ const double *sd;
+ const int mag_mv =
+ abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
+ const int left_in_image = !!xd->left_mi;
+ const int above_in_image = !!xd->above_mi;
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
+ int above_par = 0; // above_partitioning
+ int left_par = 0; // left_partitioning
+ int last_par = 0; // last_partitioning
+ BLOCK_SIZE context_size;
+ double score;
+ int offset = 0;
+
+ assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+
+ if (above_in_image) {
+ context_size = xd->above_mi->sb_type;
+ if (context_size < bsize)
+ above_par = 2;
+ else if (context_size == bsize)
+ above_par = 1;
+ }
+
+ if (left_in_image) {
+ context_size = xd->left_mi->sb_type;
+ if (context_size < bsize)
+ left_par = 2;
+ else if (context_size == bsize)
+ left_par = 1;
+ }
+
+ if (prev_mi[0]) { // test the entry; &grid[i] itself is never NULL
+ context_size = prev_mi[0]->sb_type;
+ if (context_size < bsize)
+ last_par = 2;
+ else if (context_size == bsize)
+ last_par = 1;
+ }
+
+ if (bsize == BLOCK_64X64)
+ offset = 0;
+ else if (bsize == BLOCK_32X32)
+ offset = 8;
+ else if (bsize == BLOCK_16X16)
+ offset = 16;
+
+ // early termination score calculation
+ clf = &classifiers[offset];
+ mean = &train_mean[offset];
+ sd = &train_stdm[offset];
+ score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) +
+ clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) +
+ clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) +
+ clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) +
+ clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) +
+ clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) +
+ clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7];
+ return score;
+}
+
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
@@ -2924,68 +2992,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (!x->e_mbd.lossless &&
!segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) &&
ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
- const double *clf;
- const double *mean;
- const double *sd;
- const int mag_mv =
- abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
- const int left_in_image = !!xd->left_mi;
- const int above_in_image = !!xd->above_mi;
- MODE_INFO **prev_mi =
- &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
- int above_par = 0; // above_partitioning
- int left_par = 0; // left_partitioning
- int last_par = 0; // last_partitioning
- BLOCK_SIZE context_size;
- double score;
- int offset = 0;
-
- assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
-
- if (above_in_image) {
- context_size = xd->above_mi->sb_type;
- if (context_size < bsize)
- above_par = 2;
- else if (context_size == bsize)
- above_par = 1;
- }
-
- if (left_in_image) {
- context_size = xd->left_mi->sb_type;
- if (context_size < bsize)
- left_par = 2;
- else if (context_size == bsize)
- left_par = 1;
- }
-
- if (prev_mi) {
- context_size = prev_mi[0]->sb_type;
- if (context_size < bsize)
- last_par = 2;
- else if (context_size == bsize)
- last_par = 1;
- }
-
- if (bsize == BLOCK_64X64)
- offset = 0;
- else if (bsize == BLOCK_32X32)
- offset = 8;
- else if (bsize == BLOCK_16X16)
- offset = 16;
-
- // early termination score calculation
- clf = &classifiers[offset];
- mean = &train_mean[offset];
- sd = &train_stdm[offset];
- score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) +
- clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) +
- clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) +
- clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) *
- sd[3]) +
- clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) +
- clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) +
- clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7];
- if (score < 0) {
+ if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) {
do_split = 0;
do_rect = 0;
}
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
index 2f4e5ab85..4d2d95787 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -420,14 +420,14 @@ static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
*y1 = _mm_packs_epi32(tmp2, tmp3);
}
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i c0,
- const __m128i c1) {
+static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
+ const __m128i *c1) {
__m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
u0 = _mm_unpacklo_epi16(*x0, *x1);
u1 = _mm_unpackhi_epi16(*x0, *x1);
- BUTTERFLY_PAIR(u0, u1, c0, c1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
*x0 = _mm_packs_epi32(tmp0, tmp1);
*x1 = _mm_packs_epi32(tmp2, tmp3);
}
@@ -480,8 +480,8 @@ static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
v13 = _mm_sub_epi16(u5, u7);
v14 = _mm_add_epi16(u5, u7);
- butterfly_self(&v10, &v13, stg6_0, stg4_0);
- butterfly_self(&v11, &v12, stg6_0, stg4_0);
+ butterfly_self(&v10, &v13, &stg6_0, &stg4_0);
+ butterfly_self(&v11, &v12, &stg6_0, &stg4_0);
// 1, 14
x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0
@@ -580,39 +580,39 @@ static void idct32_34_second_half(const __m128i *in, __m128i *stp1) {
u29 = _mm_sub_epi16(v30, v29);
u30 = _mm_add_epi16(v29, v30);
- butterfly_self(&u18, &u29, stg4_4, stg4_5);
- butterfly_self(&u19, &u28, stg4_4, stg4_5);
- butterfly_self(&u20, &u27, stg4_6, stg4_4);
- butterfly_self(&u21, &u26, stg4_6, stg4_4);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
stp1[16] = _mm_add_epi16(u16, u23);
- v23 = _mm_sub_epi16(u16, u23);
+ stp1[23] = _mm_sub_epi16(u16, u23);
stp1[17] = _mm_add_epi16(u17, u22);
- v22 = _mm_sub_epi16(u17, u22);
+ stp1[22] = _mm_sub_epi16(u17, u22);
stp1[18] = _mm_add_epi16(u18, u21);
- v21 = _mm_sub_epi16(u18, u21);
+ stp1[21] = _mm_sub_epi16(u18, u21);
stp1[19] = _mm_add_epi16(u19, u20);
- v20 = _mm_sub_epi16(u19, u20);
+ stp1[20] = _mm_sub_epi16(u19, u20);
- v24 = _mm_sub_epi16(u31, u24);
+ stp1[24] = _mm_sub_epi16(u31, u24);
stp1[31] = _mm_add_epi16(u24, u31);
- v25 = _mm_sub_epi16(u30, u25);
+ stp1[25] = _mm_sub_epi16(u30, u25);
stp1[30] = _mm_add_epi16(u25, u30);
- v26 = _mm_sub_epi16(u29, u26);
+ stp1[26] = _mm_sub_epi16(u29, u26);
stp1[29] = _mm_add_epi16(u26, u29);
- v27 = _mm_sub_epi16(u28, u27);
+ stp1[27] = _mm_sub_epi16(u28, u27);
stp1[28] = _mm_add_epi16(u27, u28);
- butterfly(&v20, &v27, &stg6_0, &stg4_0, &stp1[20], &stp1[27]);
- butterfly(&v21, &v26, &stg6_0, &stg4_0, &stp1[21], &stp1[26]);
- butterfly(&v22, &v25, &stg6_0, &stg4_0, &stp1[22], &stp1[25]);
- butterfly(&v23, &v24, &stg6_0, &stg4_0, &stp1[23], &stp1[24]);
+ butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0);
+ butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0);
}
// Only upper-left 8x8 has non-zero coeff
@@ -774,8 +774,8 @@ static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&v9, &v14, stg4_4, stg4_5);
- butterfly_self(&v10, &v13, stg4_6, stg4_4);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
}
out[0] = _mm_add_epi16(v8, v11);
@@ -790,8 +790,8 @@ static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/,
{
const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[2], &out[5], stg6_0, stg4_0);
- butterfly_self(&out[3], &out[4], stg6_0, stg4_0);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
}
}
@@ -882,10 +882,10 @@ static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- butterfly_self(&v17, &v30, stg3_4, stg3_5);
- butterfly_self(&v18, &v29, stg3_6, stg3_4);
- butterfly_self(&v21, &v26, stg3_8, stg3_9);
- butterfly_self(&v22, &v25, stg3_10, stg3_8);
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
}
u16 = _mm_add_epi16(v16, v19);
@@ -910,10 +910,10 @@ static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/,
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&u18, &u29, stg4_4, stg4_5);
- butterfly_self(&u19, &u28, stg4_4, stg4_5);
- butterfly_self(&u20, &u27, stg4_6, stg4_4);
- butterfly_self(&u21, &u26, stg4_6, stg4_4);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
}
out[0] = _mm_add_epi16(u16, u23);
@@ -1069,8 +1069,8 @@ static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&v9, &v14, stg4_4, stg4_5);
- butterfly_self(&v10, &v13, stg4_6, stg4_4);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
}
out[0] = _mm_add_epi16(v8, v11);
@@ -1086,8 +1086,8 @@ static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
{
const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[2], &out[5], stg6_0, stg4_0);
- butterfly_self(&out[3], &out[4], stg6_0, stg4_0);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
}
}
@@ -1208,10 +1208,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- butterfly_self(&v17, &v30, stg3_4, stg3_5);
- butterfly_self(&v18, &v29, stg3_6, stg3_4);
- butterfly_self(&v21, &v26, stg3_8, stg3_9);
- butterfly_self(&v22, &v25, stg3_10, stg3_8);
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
}
u16 = _mm_add_epi16(v16, v19);
@@ -1237,10 +1237,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&u18, &u29, stg4_4, stg4_5);
- butterfly_self(&u19, &u28, stg4_4, stg4_5);
- butterfly_self(&u20, &u27, stg4_6, stg4_4);
- butterfly_self(&u21, &u26, stg4_6, stg4_4);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
}
out[0] = _mm_add_epi16(u16, u23);
@@ -1264,10 +1264,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
{
const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[4], &out[11], stg6_0, stg4_0);
- butterfly_self(&out[5], &out[10], stg6_0, stg4_0);
- butterfly_self(&out[6], &out[9], stg6_0, stg4_0);
- butterfly_self(&out[7], &out[8], stg6_0, stg4_0);
+ butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
+ butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
+ butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
+ butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
}
}