diff options
-rw-r--r-- | test/datarate_test.cc | 44 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.c | 16 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 131 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_txfm_ssse3.c | 98 |
5 files changed, 161 insertions, 129 deletions
diff --git a/test/datarate_test.cc b/test/datarate_test.cc index e339030f9..014175981 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -561,6 +561,7 @@ class DatarateTestVP9Large } encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { @@ -988,7 +989,7 @@ TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) { } // Check basic datarate targeting, for a single bitrate, when denoiser is on, -// for clip with high noise level. +// for clip with high noise level. Use 2 threads. TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -998,11 +999,12 @@ TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { cfg_.rc_max_quantizer = 56; cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 2; ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), + // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), // but may add more modes in the future. cfg_.rc_target_bitrate = 1000; ResetModel(); @@ -1015,6 +1017,35 @@ TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { << " The datarate for the file is greater than target by too much!"; } +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for 1280x720 clip with 4 threads. +TEST_P(DatarateTestVP9LargeDenoiser, 4threads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 4; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.28) + << " The datarate for the file is greater than target by too much!"; +} + // Check basic datarate targeting, for a single bitrate, when denoiser is off // and on. TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { @@ -1228,7 +1259,7 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) { } // Check basic rate targeting for 1 pass CBR SVC with denoising. -// 2 spatial layers and 3 temporal layer. Run CIF clip with 1 thread. +// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads. TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -1243,7 +1274,7 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) { cfg_.ts_rate_decimator[1] = 2; cfg_.ts_rate_decimator[2] = 1; cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; + cfg_.g_threads = 2; cfg_.temporal_layering_mode = 3; svc_params_.scaling_factor_num[0] = 144; svc_params_.scaling_factor_den[0] = 288; @@ -1251,11 +1282,10 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersDenoiserOn) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. - for (int i = 200; i <= 800; i += 200) { + for (int i = 600; i <= 1000; i += 200) { cfg_.rc_target_bitrate = i; ResetModel(); denoiser_on_ = 1; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index c995a9dff..592a68274 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -314,6 +314,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; int is_skin = 0; + int increase_denoising = 0; int consec_zeromv = 0; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; @@ -356,22 +357,18 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv, motion_level); } - if (!is_skin && denoiser->denoising_level == kDenHigh) { - denoiser->increase_denoising = 1; - } else { - denoiser->increase_denoising = 0; - } + if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; if (denoiser->denoising_level >= kDenLow) decision = perform_motion_compensation( - denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx, + denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, cpi->svc.number_spatial_layers, cpi->Source->y_width); if (decision == FILTER_BLOCK) { - decision = vp9_denoiser_filter( - src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start, - avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude); + decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { @@ -533,7 +530,6 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, #ifdef OUTPUT_YUV_DENOISED make_grayscale(&denoiser->running_avg_y[i]); #endif - denoiser->increase_denoising = 0; denoiser->frame_buffer_initialized = 1; denoiser->denoising_level = kDenLow; denoiser->prev_denoising_level = kDenLow; diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index ce9a5966f..9bded2176 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -38,7 +38,6 @@ typedef struct vp9_denoiser { YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; YV12_BUFFER_CONFIG mc_running_avg_y; YV12_BUFFER_CONFIG last_source; - int increase_denoising; int frame_buffer_initialized; int reset; VP9_DENOISER_LEVEL denoising_level; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 4b11041b2..103680fd5 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -2725,6 +2725,74 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif +// Calculate the score used in machine-learning based partition search early +// termination. +static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const double *clf; + const double *mean; + const double *sd; + const int mag_mv = + abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); + const int left_in_image = !!xd->left_mi; + const int above_in_image = !!xd->above_mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row]; + int above_par = 0; // above_partitioning + int left_par = 0; // left_partitioning + int last_par = 0; // last_partitioning + BLOCK_SIZE context_size; + double score; + int offset = 0; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + + if (above_in_image) { + context_size = xd->above_mi->sb_type; + if (context_size < bsize) + above_par = 2; + else if (context_size == bsize) + above_par = 1; + } + + if (left_in_image) { + context_size = xd->left_mi->sb_type; + if (context_size < bsize) + left_par = 2; + else if (context_size == bsize) + left_par = 1; + } + + if (prev_mi) { + context_size = prev_mi[0]->sb_type; + if (context_size < bsize) + last_par = 2; + else if (context_size == bsize) + last_par = 1; + } + + if (bsize == BLOCK_64X64) + offset = 0; + else if (bsize == BLOCK_32X32) + offset = 8; + else if (bsize == BLOCK_16X16) + offset = 16; + + // early termination score calculation + clf = &classifiers[offset]; + mean = &train_mean[offset]; + sd = &train_stdm[offset]; + score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + + clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + + clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + + clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) + + clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + + clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) + + clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7]; + return score; +} + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. @@ -2924,68 +2992,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (!x->e_mbd.lossless && !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { - const double *clf; - const double *mean; - const double *sd; - const int mag_mv = - abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); - const int left_in_image = !!xd->left_mi; - const int above_in_image = !!xd->above_mi; - MODE_INFO **prev_mi = - &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row]; - int above_par = 0; // above_partitioning - int left_par = 0; // left_partitioning - int last_par = 0; // last_partitioning - BLOCK_SIZE context_size; - double score; - int offset = 0; - - assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); - - if (above_in_image) { - context_size = xd->above_mi->sb_type; - if (context_size < bsize) - above_par = 2; - else if (context_size == bsize) - above_par = 1; - } - - if (left_in_image) { - context_size = xd->left_mi->sb_type; - if (context_size < bsize) - left_par = 2; - else if (context_size == bsize) - left_par = 1; - } - - if (prev_mi) { - context_size = prev_mi[0]->sb_type; - if (context_size < bsize) - last_par = 2; - else if (context_size == bsize) - last_par = 1; - } - - if (bsize == BLOCK_64X64) - offset = 0; - else if (bsize == BLOCK_32X32) - offset = 8; - else if (bsize == BLOCK_16X16) - offset = 16; - - // early termination score calculation - clf = &classifiers[offset]; - mean = &train_mean[offset]; - sd = &train_stdm[offset]; - score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + - clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + - clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + - clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * - sd[3]) + - clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + - clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) + - clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7]; - if (score < 0) { + if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) { do_split = 0; do_rect = 0; } diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c index 2f4e5ab85..4d2d95787 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/vpx_dsp/x86/inv_txfm_ssse3.c @@ -420,14 +420,14 @@ static INLINE void butterfly(const __m128i *x0, const __m128i *x1, *y1 = _mm_packs_epi32(tmp2, tmp3); } -static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i c0, - const __m128i c1) { +static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, + const __m128i *c1) { __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); u0 = _mm_unpacklo_epi16(*x0, *x1); u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, c0, c1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); *x0 = _mm_packs_epi32(tmp0, tmp1); *x1 = _mm_packs_epi32(tmp2, tmp3); } @@ -480,8 +480,8 @@ static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { v13 = _mm_sub_epi16(u5, u7); v14 = _mm_add_epi16(u5, u7); - butterfly_self(&v10, &v13, stg6_0, stg4_0); - butterfly_self(&v11, &v12, stg6_0, stg4_0); + butterfly_self(&v10, &v13, &stg6_0, &stg4_0); + butterfly_self(&v11, &v12, &stg6_0, &stg4_0); // 1, 14 x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 @@ -580,39 +580,39 @@ static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { u29 = _mm_sub_epi16(v30, v29); u30 = _mm_add_epi16(v29, v30); - butterfly_self(&u18, &u29, stg4_4, stg4_5); - butterfly_self(&u19, &u28, stg4_4, stg4_5); - butterfly_self(&u20, &u27, stg4_6, stg4_4); - butterfly_self(&u21, &u26, stg4_6, stg4_4); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); stp1[16] = _mm_add_epi16(u16, u23); - v23 = _mm_sub_epi16(u16, u23); + stp1[23] = _mm_sub_epi16(u16, u23); stp1[17] = _mm_add_epi16(u17, u22); - v22 = _mm_sub_epi16(u17, u22); + stp1[22] = _mm_sub_epi16(u17, u22); stp1[18] = _mm_add_epi16(u18, u21); - v21 = _mm_sub_epi16(u18, u21); + stp1[21] = _mm_sub_epi16(u18, u21); stp1[19] = _mm_add_epi16(u19, u20); - v20 = _mm_sub_epi16(u19, u20); + stp1[20] = _mm_sub_epi16(u19, u20); - v24 = _mm_sub_epi16(u31, u24); + stp1[24] = _mm_sub_epi16(u31, u24); stp1[31] = _mm_add_epi16(u24, u31); - v25 = _mm_sub_epi16(u30, u25); + stp1[25] = _mm_sub_epi16(u30, u25); stp1[30] = _mm_add_epi16(u25, u30); - v26 = _mm_sub_epi16(u29, u26); + stp1[26] = _mm_sub_epi16(u29, u26); stp1[29] = _mm_add_epi16(u26, u29); - v27 = _mm_sub_epi16(u28, u27); + stp1[27] = _mm_sub_epi16(u28, u27); stp1[28] = _mm_add_epi16(u27, u28); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &stp1[20], &stp1[27]); - butterfly(&v21, &v26, &stg6_0, &stg4_0, &stp1[21], &stp1[26]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &stp1[22], &stp1[25]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &stp1[23], &stp1[24]); + butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); + butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); + butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); + butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); } // Only upper-left 8x8 has non-zero coeff @@ -774,8 +774,8 @@ static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, stg4_4, stg4_5); - butterfly_self(&v10, &v13, stg4_6, stg4_4); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); } out[0] = _mm_add_epi16(v8, v11); @@ -790,8 +790,8 @@ static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, { const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], stg6_0, stg4_0); - butterfly_self(&out[3], &out[4], stg6_0, stg4_0); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); } } @@ -882,10 +882,10 @@ static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, stg3_4, stg3_5); - butterfly_self(&v18, &v29, stg3_6, stg3_4); - butterfly_self(&v21, &v26, stg3_8, stg3_9); - butterfly_self(&v22, &v25, stg3_10, stg3_8); + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); } u16 = _mm_add_epi16(v16, v19); @@ -910,10 +910,10 @@ static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, stg4_4, stg4_5); - butterfly_self(&u19, &u28, stg4_4, stg4_5); - butterfly_self(&u20, &u27, stg4_6, stg4_4); - butterfly_self(&u21, &u26, stg4_6, stg4_4); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); } out[0] = _mm_add_epi16(u16, u23); @@ -1069,8 +1069,8 @@ static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, stg4_4, stg4_5); - butterfly_self(&v10, &v13, stg4_6, stg4_4); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); } out[0] = _mm_add_epi16(v8, v11); @@ -1086,8 +1086,8 @@ static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, { const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], stg6_0, stg4_0); - butterfly_self(&out[3], &out[4], stg6_0, stg4_0); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); } } @@ -1208,10 +1208,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, stg3_4, stg3_5); - butterfly_self(&v18, &v29, stg3_6, stg3_4); - butterfly_self(&v21, &v26, stg3_8, stg3_9); - butterfly_self(&v22, &v25, stg3_10, stg3_8); + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); } u16 = _mm_add_epi16(v16, v19); @@ -1237,10 +1237,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, stg4_4, stg4_5); - butterfly_self(&u19, &u28, stg4_4, stg4_5); - butterfly_self(&u20, &u27, stg4_6, stg4_4); - butterfly_self(&u21, &u26, stg4_6, stg4_4); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); } out[0] = _mm_add_epi16(u16, u23); @@ -1264,10 +1264,10 @@ static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, { const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], stg6_0, stg4_0); - butterfly_self(&out[5], &out[10], stg6_0, stg4_0); - butterfly_self(&out[6], &out[9], stg6_0, stg4_0); - butterfly_self(&out[7], &out[8], stg6_0, stg4_0); + butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); + butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); + butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); + butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); } } |