diff options
-rw-r--r-- | build/make/configure.sh | 22 | ||||
-rwxr-xr-x | configure | 19 | ||||
-rw-r--r-- | test/vp9_avg_test.cc | 96 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vp9/encoder/arm/neon/vp9_avg_neon.c | 27 | ||||
-rw-r--r-- | vp9/encoder/vp9_avg.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.c | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.h | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 20 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_noise_estimate.c | 15 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 59 | ||||
-rw-r--r-- | vp9/encoder/vp9_svc_layercontext.c | 2 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_avg_intrin_sse2.c | 39 | ||||
-rw-r--r-- | vpx/src/svc_encodeframe.c | 6 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vpx_dsp/x86/intrapred_sse2.asm | 36 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm | 486 |
18 files changed, 748 insertions, 105 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh index 37ed86f83..98248b0a7 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -1208,14 +1208,20 @@ EOF soft_enable runtime_cpu_detect # We can't use 'check_cflags' until the compiler is configured and CC is # populated. - check_gcc_machine_option mmx - check_gcc_machine_option sse - check_gcc_machine_option sse2 - check_gcc_machine_option sse3 - check_gcc_machine_option ssse3 - check_gcc_machine_option sse4 sse4_1 - check_gcc_machine_option avx - check_gcc_machine_option avx2 + for ext in ${ARCH_EXT_LIST_X86}; do + # disable higher order extensions to simplify asm dependencies + if [ "$disable_exts" = "yes" ]; then + if ! disabled $ext; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + disable_feature $ext + fi + elif disabled $ext; then + disable_exts="yes" + else + # use the shortened version for the flag: sse4_1 -> sse4 + check_gcc_machine_option ${ext%_*} $ext + fi + done if enabled external_build; then log_echo " skipping assembler detection" @@ -234,6 +234,16 @@ ARCH_LIST=" x86 x86_64 " +ARCH_EXT_LIST_X86=" + mmx + sse + sse2 + sse3 + ssse3 + sse4_1 + avx + avx2 +" ARCH_EXT_LIST=" edsp media @@ -245,14 +255,7 @@ ARCH_EXT_LIST=" msa mips64 - mmx - sse - sse2 - sse3 - ssse3 - sse4_1 - avx - avx2 + ${ARCH_EXT_LIST_X86} " HAVE_LIST=" ${ARCH_EXT_LIST} diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc index d38313116..290bdc75e 100644 --- a/test/vp9_avg_test.cc +++ b/test/vp9_avg_test.cc @@ -194,6 +194,48 @@ class IntProColTest int16_t sum_c_; }; +typedef int (*SatdFunc)(const int16_t *coeffs, int length); +typedef std::tr1::tuple<int, SatdFunc> SatdTestParam; + +class SatdTest + : public ::testing::Test, + public ::testing::WithParamInterface<SatdTestParam> { + protected: + virtual void SetUp() { + satd_size_ = GET_PARAM(0); + satd_func_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<int16_t*>( + vpx_memalign(16, sizeof(*src_) * satd_size_)); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libvpx_test::ClearSystemState(); + vpx_free(src_); + } + + void FillConstant(const int16_t val) { + for (int i = 0; i < satd_size_; ++i) src_[i] = val; + } + + void FillRandom() { + for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16(); + } + + void Check(const int expected) { + int total; + ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_)); + EXPECT_EQ(expected, total); + } + + int satd_size_; + + private: + int16_t *src_; + SatdFunc satd_func_; + ACMRandom rnd_; +}; uint8_t* AverageTestBase::source_data_ = NULL; @@ -246,6 +288,36 @@ TEST_P(IntProColTest, Random) { RunComparison(); } + +TEST_P(SatdTest, MinValue) { + const int kMin = -32640; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdTest, MaxValue) { + const int kMax = 32640; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdTest, Random) { + int expected; + switch (satd_size_) { + case 16: expected = 205298; break; + case 64: expected = 1113950; break; + case 256: expected = 4268415; break; + case 1024: expected = 16954082; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + using std::tr1::make_tuple; INSTANTIATE_TEST_CASE_P( @@ -254,6 +326,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c), make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c))); +INSTANTIATE_TEST_CASE_P( + C, SatdTest, + ::testing::Values( + make_tuple(16, &vp9_satd_c), + make_tuple(64, &vp9_satd_c), + make_tuple(256, &vp9_satd_c), + make_tuple(1024, &vp9_satd_c))); + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, AverageTest, @@ -276,6 +356,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c), make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c), make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c))); + +INSTANTIATE_TEST_CASE_P( + SSE2, SatdTest, + ::testing::Values( + make_tuple(16, &vp9_satd_sse2), + make_tuple(64, &vp9_satd_sse2), + make_tuple(256, &vp9_satd_sse2), + make_tuple(1024, &vp9_satd_sse2))); #endif #if HAVE_NEON @@ -297,6 +385,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c), make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c), make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c))); + +INSTANTIATE_TEST_CASE_P( + NEON, SatdTest, + ::testing::Values( + make_tuple(16, &vp9_satd_neon), + make_tuple(64, &vp9_satd_neon), + make_tuple(256, &vp9_satd_neon), + make_tuple(1024, &vp9_satd_neon))); #endif #if HAVE_MSA diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 890b63821..8fe6503aa 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -209,8 +209,8 @@ specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc"; add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff"; specialize qw/vp9_hadamard_16x16 sse2/; -add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length"; -specialize qw/vp9_satd sse2/; +add_proto qw/int vp9_satd/, "const int16_t *coeff, int length"; +specialize qw/vp9_satd sse2 neon/; add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height"; specialize qw/vp9_int_pro_row sse2 neon/; diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c index d569ec95d..5996bd426 100644 --- a/vp9/encoder/arm/neon/vp9_avg_neon.c +++ b/vp9/encoder/arm/neon/vp9_avg_neon.c @@ -50,6 +50,33 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { return (horizontal_add_u16x8(v_sum) + 32) >> 6; } +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int vp9_satd_neon(const int16_t *coeff, int length) { + const int16x4_t zero = vdup_n_s16(0); + int32x4_t accum = vdupq_n_s32(0); + + do { + const int16x8_t src0 = vld1q_s16(coeff); + const int16x8_t src8 = vld1q_s16(coeff + 8); + accum = vabal_s16(accum, vget_low_s16(src0), zero); + accum = vabal_s16(accum, vget_high_s16(src0), zero); + accum = vabal_s16(accum, vget_low_s16(src8), zero); + accum = vabal_s16(accum, vget_high_s16(src8), zero); + length -= 16; + coeff += 16; + } while (length != 0); + + { + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. + const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), + vreinterpret_s32_s64(vget_high_s64(s0))); + const int satd = vget_lane_s32(s1, 0); + return satd; + } +} + void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c index a9a4c3050..7baa09ae5 100644 --- a/vp9/encoder/vp9_avg.c +++ b/vp9/encoder/vp9_avg.c @@ -117,14 +117,14 @@ void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. -int16_t vp9_satd_c(const int16_t *coeff, int length) { +int vp9_satd_c(const int16_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - return (int16_t)satd; + return satd; } // Integer projection onto row vectors. diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index e87a12e44..fc76c11c4 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -316,7 +316,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, - PICK_MODE_CONTEXT *ctx) { + PICK_MODE_CONTEXT *ctx, + VP9_DENOISER_DECISION *denoiser_decision) { int mv_col, mv_row; int motion_magnitude = 0; VP9_DENOISER_DECISION decision = COPY_BLOCK; @@ -380,6 +381,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } + *denoiser_decision = decision; } static void copy_frame(YV12_BUFFER_CONFIG * const dest, @@ -458,6 +460,7 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { ctx->zeromv_sse = UINT_MAX; ctx->newmv_sse = UINT_MAX; + ctx->zeromv_lastref_sse = UINT_MAX; } void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index bc676e925..c8c93528b 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -54,7 +54,8 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, - PICK_MODE_CONTEXT *ctx); + PICK_MODE_CONTEXT *ctx , + VP9_DENOISER_DECISION *denoiser_decision); void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f9c28f6a9..e9b62bbc5 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -843,7 +843,8 @@ static int choose_partitioning(VP9_COMP *cpi, force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (vt.split[i].split[j].part_variances.none.variance > + } else if (cpi->oxcf.speed < 8 && + vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { // We have some nominal amount of 16x16 variance (based on average), @@ -1746,16 +1747,6 @@ static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, set_offsets(cpi, tile, x, mi_row, mi_col, bsize); update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize); -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && - output_enabled && - cpi->common.frame_type != KEY_FRAME && - cpi->resize_pending == 0) { - vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, - VPXMAX(BLOCK_8X8, bsize), ctx); - } -#endif - encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); update_stats(&cpi->common, td); @@ -2432,8 +2423,15 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (cpi->sf.use_square_partition_only && bsize > cpi->sf.use_square_only_threshold) { + if (cpi->use_svc) { + if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) + partition_horz_allowed &= force_horz_split; + if (!vp9_active_v_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) + partition_vert_allowed &= force_vert_split; + } else { partition_horz_allowed &= force_horz_split; partition_vert_allowed &= force_vert_split; + } } save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index a57cf8725..72fa82835 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3682,12 +3682,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH - if (use_normative_scaler) + if (use_normative_scaler && + unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); else scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); #else - if (use_normative_scaler) + if (use_normative_scaler && + unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) scale_and_extend_frame(unscaled, scaled); else scale_and_extend_frame_nonnormative(unscaled, scaled); diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index b41ffd0a3..b26f6f217 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -25,7 +25,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; - ne->level = kLow; + ne->level = kLowLow; ne->value = 0; ne->count = 0; ne->thresh = 90; @@ -220,22 +220,25 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Reset counter and check noise level condition. ne->num_frames_estimate = 30; ne->count = 0; - if (ne->value > (ne->thresh << 1)) + if (ne->value > (ne->thresh << 1)) { ne->level = kHigh; - else + } else { if (ne->value > ne->thresh) ne->level = kMedium; else if (ne->value > (ne->thresh >> 1)) ne->level = kLow; else ne->level = kLowLow; + } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) + vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); +#endif } } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) copy_frame(&cpi->denoiser.last_source, cpi->Source); - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); - } #endif } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 8aafae1d4..095847a23 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -673,7 +673,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, if (*eob == 1) *rate += (int)abs(qcoeff[0]); else if (*eob > 1) - *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4); + *rate += vp9_satd((const int16_t *)qcoeff, step << 4); *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift; } @@ -1143,6 +1143,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int best_pred_sad = INT_MAX; int best_early_term = 0; int ref_frame_cost[MAX_REF_FRAMES]; +#if CONFIG_VP9_TEMPORAL_DENOISING + int64_t zero_last_cost_orig = INT64_MAX; +#endif init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1524,8 +1527,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx); + // Keep track of zero_last cost. + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) + zero_last_cost_orig = this_rdc.rdcost; + } #else (void)ctx; #endif @@ -1683,6 +1690,54 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && + cpi->resize_pending == 0) { + VP9_DENOISER_DECISION decision = COPY_BLOCK; + vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col, + VPXMAX(BLOCK_8X8, bsize), ctx, &decision); + // If INTRA mode was selected, re-evaluate ZEROMV on denoised result. + // Only do this under noise conditions, and if rdcost of ZEROMV on + // original source is not significantly higher than rdcost of INTRA MODE. + if (best_ref_frame == INTRA_FRAME && + decision == FILTER_BLOCK && + cpi->noise_estimate.enabled && + cpi->noise_estimate.level > kLow && + zero_last_cost_orig < (best_rdc.rdcost << 2)) { + // Check if we should pick ZEROMV on denoised signal. + int rate = 0; + int64_t dist = 0; + mbmi->mode = ZEROMV; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + mbmi->interp_filter = EIGHTTAP; + xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] + + cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]] + [INTER_OFFSET(ZEROMV)]; + this_rdc.dist = dist; + this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); + // Switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is lower than INTRA (on original source). + if (this_rdc.rdcost > best_rdc.rdcost) { + this_rdc = best_rdc; + mbmi->mode = best_mode; + mbmi->ref_frame[0] = best_ref_frame; + mbmi->mv[0].as_int = INVALID_MV; + mbmi->interp_filter = best_pred_filter; + mbmi->tx_size = best_tx_size; + x->skip_txfm[0] = best_mode_skip_txfm; + } else { + best_ref_frame = LAST_FRAME; + best_rdc = this_rdc; + } + } + } +#endif + if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)]; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 13da155c7..b0617c1ca 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -279,7 +279,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) { + (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) { cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; } diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 4531d794a..441487130 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -283,31 +283,30 @@ void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, } } -int16_t vp9_satd_sse2(const int16_t *coeff, int length) { +int vp9_satd_sse2(const int16_t *coeff, int length) { int i; - __m128i sum = _mm_load_si128((const __m128i *)coeff); - __m128i sign = _mm_srai_epi16(sum, 15); - __m128i val = _mm_xor_si128(sum, sign); - sum = _mm_sub_epi16(val, sign); - coeff += 8; - - for (i = 8; i < length; i += 8) { - __m128i src_line = _mm_load_si128((const __m128i *)coeff); - sign = _mm_srai_epi16(src_line, 15); - val = _mm_xor_si128(src_line, sign); - val = _mm_sub_epi16(val, sign); - sum = _mm_add_epi16(sum, val); + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); coeff += 8; } - val = _mm_srli_si128(sum, 8); - sum = _mm_add_epi16(sum, val); - val = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi16(sum, val); - val = _mm_srli_epi32(sum, 16); - sum = _mm_add_epi16(sum, val); + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } - return _mm_extract_epi16(sum, 0); + return _mm_cvtsi128_si32(accum); } void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c index 68d1d8d3a..5c3fe93fd 100644 --- a/vpx/src/svc_encodeframe.c +++ b/vpx/src/svc_encodeframe.c @@ -423,13 +423,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_ctx->temporal_layers = 2; } + for (sl = 0; sl < VPX_SS_MAX_LAYERS; ++sl) { + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; + } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; - si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl]; - si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; } } diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d11b32778..1352c3462 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -890,7 +890,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/; + specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc"; diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm index 5ef7ae313..0189f32a0 100644 --- a/vpx_dsp/x86/intrapred_sse2.asm +++ b/vpx_dsp/x86/intrapred_sse2.asm @@ -561,33 +561,31 @@ cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left RET INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left +cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left pxor m1, m1 movd m2, [aboveq-1] movq m0, [aboveq] punpcklbw m2, m1 - punpcklbw m0, m1 - pshuflw m2, m2, 0x0 + punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] + pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] DEFINE_ARGS dst, stride, line, left mov lineq, -4 - punpcklqdq m2, m2 - add leftq, 8 - psubw m0, m2 -.loop: - movd m2, [leftq+lineq*2] - movd m3, [leftq+lineq*2+1] - punpcklbw m2, m1 - punpcklbw m3, m1 - pshuflw m2, m2, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - paddw m2, m0 + punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] + psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] + movq m2, [leftq] + punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] +.loop + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] + punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] + paddw m4, m0 paddw m3, m0 - packuswb m2, m3 - movq [dstq ], m2 - movhps [dstq+strideq], m2 + packuswb m4, m3 + movq [dstq ], m4 + movhps [dstq+strideq], m4 lea dstq, [dstq+strideq*2] + psrldq m2, 4 inc lineq jnz .loop REP_RET diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm index 2675eab1f..43c7ad87e 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -57,6 +57,14 @@ TRANSFORM_COEFFS 1606, 16305 TRANSFORM_COEFFS 15679, 4756 TRANSFORM_COEFFS 11585, 11585 +; constants for 32x32_1024 +TRANSFORM_COEFFS 12140, 11003 +TRANSFORM_COEFFS 7005, 14811 +TRANSFORM_COEFFS 14053, 8423 +TRANSFORM_COEFFS 9760, 13160 +TRANSFORM_COEFFS 12665, 10394 +TRANSFORM_COEFFS 7723, 14449 + %macro PAIR_PP_COEFFS 2 dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 %endmacro @@ -368,23 +376,24 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride %define idx30 16 * 6 %define idx31 16 * 7 +; FROM idct32x32_add_neon.asm +; +; Instead of doing the transforms stage by stage, it is done by loading +; some input values and doing as many stages as possible to minimize the +; storing/loading of intermediate results. To fit within registers, the +; final coefficients are cut into four blocks: +; BLOCK A: 16-19,28-31 +; BLOCK B: 20-23,24-27 +; BLOCK C: 8-11,12-15 +; BLOCK D: 0-3,4-7 +; Blocks A and C are straight calculation through the various stages. In +; block B, further calculations are performed using the results from +; block A. In block D, further calculations are performed using the results +; from block C and then the final calculations are done using results from +; block A and B which have been combined at the end of block B. +; + %macro IDCT32X32_34 4 - ; FROM idct32x32_add_neon.asm - ; - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-11,12-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - ; ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m11, m1 pmulhrsw m1, [pw___804x2] ; stp1_16 @@ -475,7 +484,7 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ %if 0 ; overflow occurs in SUM_SUB when using test streams mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 + SUM_SUB 6, 5, 9 pmulhrsw m6, m10 ; stp1_27 pmulhrsw m5, m10 ; stp1_20 SUM_SUB 13, 14, 9 @@ -539,10 +548,10 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ %if 0 ; overflow occurs in SUM_SUB when using test streams mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 + SUM_SUB 5, 4, 9 pmulhrsw m5, m10 ; stp1_13 pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 + SUM_SUB 6, 7, 9 pmulhrsw m6, m10 ; stp1_12 pmulhrsw m7, m10 ; stp1_11 %else @@ -783,4 +792,443 @@ idct32x32_34_transpose_2: RECON_AND_STORE pass_two_start RET + +%macro IDCT32X32_1024 4 + ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m1, [rsp + transposed_in + 16 * 1] + mova m11, [rsp + transposed_in + 16 * 31] + BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 + + mova m0, [rsp + transposed_in + 16 * 15] + mova m2, [rsp + transposed_in + 16 * 17] + BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 + + mova m7, [rsp + transposed_in + 16 * 7] + mova m12, [rsp + transposed_in + 16 * 25] + BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 + + mova m3, [rsp + transposed_in + 16 * 9] + mova m4, [rsp + transposed_in + 16 * 23] + BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 + + ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 + SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 + SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 + SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 + + ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 + BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 + + ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 + SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 + SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 + SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 + + ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 + BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 + + mova [stp + %3 + idx16], m1 + mova [stp + %3 + idx17], m0 + mova [stp + %3 + idx18], m4 + mova [stp + %3 + idx19], m7 + mova [stp + %4 + idx28], m12 + mova [stp + %4 + idx29], m3 + mova [stp + %4 + idx30], m2 + mova [stp + %4 + idx31], m11 + + ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m5, [rsp + transposed_in + 16 * 5] + mova m6, [rsp + transposed_in + 16 * 27] + BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 + + mova m13, [rsp + transposed_in + 16 * 21] + mova m14, [rsp + transposed_in + 16 * 11] + BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 + + mova m0, [rsp + transposed_in + 16 * 13] + mova m1, [rsp + transposed_in + 16 * 19] + BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 + + mova m2, [rsp + transposed_in + 16 * 3] + mova m3, [rsp + transposed_in + 16 * 29] + BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 + + ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 + SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 + SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 + SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 + + ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 + BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 + + ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 + SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 + SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 + SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 + + ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 + BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 + + ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %3 + idx16] + mova m7, [stp + %3 + idx17] + mova m11, [stp + %3 + idx18] + mova m12, [stp + %3 + idx19] + SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 + SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 + SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 + SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 + mova [stp + %3 + idx16], m4 + mova [stp + %3 + idx17], m7 + mova [stp + %3 + idx18], m11 + mova [stp + %3 + idx19], m12 + + mova m4, [stp + %4 + idx28] + mova m7, [stp + %4 + idx29] + mova m11, [stp + %4 + idx30] + mova m12, [stp + %4 + idx31] + SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 + SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 + SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 + SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 + mova [stp + %4 + idx28], m4 + mova [stp + %4 + idx29], m7 + mova [stp + %4 + idx30], m11 + mova [stp + %4 + idx31], m12 + + ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 6, 5, 9 + pmulhrsw m6, m10 ; stp1_27 + pmulhrsw m5, m10 ; stp1_20 + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_26 + pmulhrsw m14, m10 ; stp1_21 + SUM_SUB 1, 0, 9 + pmulhrsw m1, m10 ; stp1_25 + pmulhrsw m0, m10 ; stp1_22 + SUM_SUB 2, 3, 9 + pmulhrsw m2, m10 ; stp1_25 + pmulhrsw m3, m10 ; stp1_22 +%else + BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 + SWAP 6, 5 + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 + SWAP 13, 14 + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 + SWAP 1, 0 + BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 + SWAP 2, 3 +%endif + mova [stp + %3 + idx20], m5 + mova [stp + %3 + idx21], m14 + mova [stp + %3 + idx22], m0 + mova [stp + %3 + idx23], m3 + mova [stp + %4 + idx24], m2 + mova [stp + %4 + idx25], m1 + mova [stp + %4 + idx26], m13 + mova [stp + %4 + idx27], m6 + + ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 2] + mova m1, [rsp + transposed_in + 16 * 30] + BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 + + mova m2, [rsp + transposed_in + 16 * 14] + mova m3, [rsp + transposed_in + 16 * 18] + BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 + + mova m4, [rsp + transposed_in + 16 * 10] + mova m5, [rsp + transposed_in + 16 * 22] + BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 + + mova m6, [rsp + transposed_in + 16 * 6] + mova m7, [rsp + transposed_in + 16 * 26] + BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 + + ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 + SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 + SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 + SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 + + ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 + BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 + + ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 + SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 + SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 + SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 + + ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 5, 4, 9 + pmulhrsw m5, m10 ; stp1_13 + pmulhrsw m4, m10 ; stp1_10 + SUM_SUB 6, 7, 9 + pmulhrsw m6, m10 ; stp1_12 + pmulhrsw m7, m10 ; stp1_11 +%else + BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 + SWAP 5, 4 + BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 + SWAP 6, 7 +%endif + ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova [stp + %2 + idx8], m0 + mova [stp + %2 + idx9], m2 + mova [stp + %2 + idx10], m4 + mova [stp + %2 + idx11], m7 + mova [stp + %2 + idx12], m6 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m3 + mova [stp + %2 + idx15], m1 + + ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m11, [rsp + transposed_in + 16 * 4] + mova m12, [rsp + transposed_in + 16 * 28] + BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 + + mova m13, [rsp + transposed_in + 16 * 12] + mova m14, [rsp + transposed_in + 16 * 20] + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 + + ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 0] + mova m1, [rsp + transposed_in + 16 * 16] + +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 0, 1, 9 + pmulhrsw m0, m10 ; stp1_1 + pmulhrsw m1, m10 ; stp1_0 +%else + BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 + SWAP 0, 1 +%endif + mova m2, [rsp + transposed_in + 16 * 8] + mova m3, [rsp + transposed_in + 16 * 24] + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 + + mova m10, [pw_11585x2] + SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 + SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 + + ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_6 + pmulhrsw m14, m10 ; stp1_5 +%else + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 + SWAP 13, 14 +%endif + SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 + SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 + + ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 + SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 + SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 + SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 + + ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %2 + idx12] + mova m5, [stp + %2 + idx13] + mova m6, [stp + %2 + idx14] + mova m7, [stp + %2 + idx15] + SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 + SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 + SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 + SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 + + ; 0-3, 28-31 final stage + mova m10, [stp + %4 + idx31] + mova m15, [stp + %4 + idx30] + SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 + SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 + mova [stp + %1 + idx0], m0 + mova [stp + %1 + idx1], m1 + mova [stp + %4 + idx31], m10 + mova [stp + %4 + idx30], m15 + mova m0, [stp + %4 + idx29] + mova m1, [stp + %4 + idx28] + SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 + SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 + mova [stp + %1 + idx2], m2 + mova [stp + %1 + idx3], m3 + mova [stp + %4 + idx29], m0 + mova [stp + %4 + idx28], m1 + + ; 12-15, 16-19 final stage + mova m0, [stp + %3 + idx16] + mova m1, [stp + %3 + idx17] + mova m2, [stp + %3 + idx18] + mova m3, [stp + %3 + idx19] + SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 + SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 + SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 + SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 + mova [stp + %2 + idx12], m4 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m6 + mova [stp + %2 + idx15], m7 + mova [stp + %3 + idx16], m0 + mova [stp + %3 + idx17], m1 + mova [stp + %3 + idx18], m2 + mova [stp + %3 + idx19], m3 + + mova m4, [stp + %2 + idx8] + mova m5, [stp + %2 + idx9] + mova m6, [stp + %2 + idx10] + mova m7, [stp + %2 + idx11] + SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 + SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 + SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 + SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 + + ; 4-7, 24-27 final stage + mova m3, [stp + %4 + idx24] + mova m2, [stp + %4 + idx25] + mova m1, [stp + %4 + idx26] + mova m0, [stp + %4 + idx27] + SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 + SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 + SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 + SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 + mova [stp + %4 + idx24], m3 + mova [stp + %4 + idx25], m2 + mova [stp + %4 + idx26], m1 + mova [stp + %4 + idx27], m0 + mova [stp + %1 + idx4], m11 + mova [stp + %1 + idx5], m14 + mova [stp + %1 + idx6], m13 + mova [stp + %1 + idx7], m12 + + ; 8-11, 20-23 final stage + mova m0, [stp + %3 + idx20] + mova m1, [stp + %3 + idx21] + mova m2, [stp + %3 + idx22] + mova m3, [stp + %3 + idx23] + SUM_SUB 7, 0, 9 ; stp1_11, stp_20 + SUM_SUB 6, 1, 9 ; stp1_10, stp_21 + SUM_SUB 5, 2, 9 ; stp1_9, stp_22 + SUM_SUB 4, 3, 9 ; stp1_8, stp_23 + mova [stp + %2 + idx8], m4 + mova [stp + %2 + idx9], m5 + mova [stp + %2 + idx10], m6 + mova [stp + %2 + idx11], m7 + mova [stp + %3 + idx20], m0 + mova [stp + %3 + idx21], m1 + mova [stp + %3 + idx22], m2 + mova [stp + %3 + idx23], m3 +%endmacro + +INIT_XMM ssse3 +cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride + mova m8, [pd_8192] + mov r6, 4 + lea stp, [rsp + pass_one_start] + +idct32x32_1024: + mov r3, inputq + lea r4, [rsp + transposed_in] + mov r7, 4 + +idct32x32_1024_transpose: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 4] + mova m2, [r3 + 16 * 8] + mova m3, [r3 + 16 * 12] + mova m4, [r3 + 16 * 16] + mova m5, [r3 + 16 * 20] + mova m6, [r3 + 16 * 24] + mova m7, [r3 + 16 * 28] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + + add r3, 16 + add r4, 16 * 8 + dec r7 + jne idct32x32_1024_transpose + + IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 + + lea stp, [stp + 16 * 8] + lea inputq, [inputq + 16 * 32] + dec r6 + jnz idct32x32_1024 + + mov r6, 4 + lea stp, [rsp + pass_one_start] + lea r9, [rsp + pass_one_start] + +idct32x32_1024_2: + lea r4, [rsp + transposed_in] + mov r3, r9 + mov r7, 4 + +idct32x32_1024_transpose_2: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 1] + mova m2, [r3 + 16 * 2] + mova m3, [r3 + 16 * 3] + mova m4, [r3 + 16 * 4] + mova m5, [r3 + 16 * 5] + mova m6, [r3 + 16 * 6] + mova m7, [r3 + 16 * 7] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + + add r3, 16 * 8 + add r4, 16 * 8 + dec r7 + jne idct32x32_1024_transpose_2 + + IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 + + lea stp, [stp + 16 * 32] + add r9, 16 * 32 + dec r6 + jnz idct32x32_1024_2 + + RECON_AND_STORE pass_two_start + + RET %endif |