diff options
58 files changed, 2972 insertions, 3990 deletions
diff --git a/build/make/ios-Info.plist b/build/make/ios-Info.plist index 8d1da32fd..d157b11a0 100644 --- a/build/make/ios-Info.plist +++ b/build/make/ios-Info.plist @@ -31,5 +31,7 @@ <integer>1</integer> <integer>2</integer> </array> + <key>VPXFullVersion</key> + <string>${FULLVERSION}</string> </dict> </plist> diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh index 21610745c..96dc6cc8c 100755 --- a/build/make/iosbuild.sh +++ b/build/make/iosbuild.sh @@ -226,6 +226,7 @@ build_framework() { # Copy in Info.plist. cat "${SCRIPT_DIR}/ios-Info.plist" \ + | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \ | sed "s/\${VERSION}/${VERSION}/g" \ | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \ > "${FRAMEWORK_DIR}/Info.plist" @@ -341,8 +342,9 @@ if [ "${ENABLE_SHARED}" = "yes" ]; then CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}" fi -VERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}" \ - | sed -E 's/^v(.*)$/\1/') +FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBVPX_SOURCE_DIR}") +VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/') + if [ "$ENABLE_SHARED" = "yes" ]; then IOS_VERSION_OPTIONS="--enable-shared" else @@ -369,6 +371,7 @@ cat << EOF OSX_TARGETS="${OSX_TARGETS}" SIM_TARGETS="${SIM_TARGETS}" SCRIPT_DIR="${SCRIPT_DIR}" + FULLVERSION="${FULLVERSION}" VERSION="${VERSION}" IOS_VERSION_MIN="${IOS_VERSION_MIN}" EOF @@ -23,7 +23,7 @@ static void fix_framerate(int *num, int *den) { // we can guess the framerate using only the timebase in this // case. Other files would require reading ahead to guess the // timebase, like we do for webm. - if (*num < 1000) { + if (*den > 0 && *num > 0 && *num < 1000) { // Correct for the factor of 2 applied to the timebase in the encoder. if (*num & 1) *den *= 2; diff --git a/test/acm_random.h b/test/acm_random.h index a29ced2f7..b94b6e195 100644 --- a/test/acm_random.h +++ b/test/acm_random.h @@ -35,7 +35,7 @@ class ACMRandom { int16_t Rand9Signed(void) { // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). const uint32_t value = random_.Generate(512); - return static_cast<int16_t>(value - 256); + return static_cast<int16_t>(value) - 256; } uint8_t Rand8(void) { diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc index d25e4f5f5..e9945c409 100644 --- a/test/add_noise_test.cc +++ b/test/add_noise_test.cc @@ -185,11 +185,6 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { INSTANTIATE_TEST_CASE_P(C, AddNoiseTest, ::testing::Values(vpx_plane_add_noise_c)); -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P(MMX, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_mmx)); -#endif - #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest, ::testing::Values(vpx_plane_add_noise_sse2)); diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 5467c46cf..3941e16fc 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -450,7 +450,28 @@ class DatarateTestVP9Large : public ::libvpx_test::EncoderTest, int denoiser_offon_period_; }; -// Check basic rate targeting, +// Check basic rate targeting for VBR mode. +TEST_P(DatarateTestVP9Large, BasicRateTargetingVBR) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + for (int i = 400; i <= 800; i += 400) { + cfg_.rc_target_bitrate = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + << " The datarate for the file is greater than target by too much!"; + } +} + +// Check basic rate targeting for CBR, TEST_P(DatarateTestVP9Large, BasicRateTargeting) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; @@ -474,7 +495,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) { } } -// Check basic rate targeting, +// Check basic rate targeting for CBR. TEST_P(DatarateTestVP9Large, BasicRateTargeting444) { ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index c4dfaf345..ddaf9395b 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -373,10 +373,10 @@ class Trans16x16TestBase { for (int j = 0; j < kNumCoeffs; ++j) { #if CONFIG_VP9_HIGHBITDEPTH - const uint32_t diff = + const int32_t diff = bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; #else - const uint32_t diff = dst[j] - src[j]; + const int32_t diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; if (max_error < error) diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 1cbac5c63..16d88255e 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -154,10 +154,10 @@ TEST_P(Trans32x32Test, AccuracyCheck) { for (int j = 0; j < kNumCoeffs; ++j) { #if CONFIG_VP9_HIGHBITDEPTH - const uint32_t diff = + const int32_t diff = bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; #else - const uint32_t diff = dst[j] - src[j]; + const int32_t diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; if (max_error < error) diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 0c91aee21..5a58830d5 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -487,19 +487,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); #endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \ - !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - MMX, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8))); -#endif - -#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \ - !CONFIG_EMULATE_HARDWARE +#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( SSE2, Trans4x4WHT, ::testing::Values( + make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8), make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8))); #endif diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index edf468216..0c081ee1f 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -458,7 +458,7 @@ class FwdTrans8x8TestBase { coeff_r[j] = static_cast<tran_low_t>(round(out_r[j])); for (int j = 0; j < kNumCoeffs; ++j) { - const uint32_t diff = coeff[j] - coeff_r[j]; + const int32_t diff = coeff[j] - coeff_r[j]; const uint32_t error = diff * diff; EXPECT_GE(9u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 DCT has error " << error diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 778a36ca3..94646e4ff 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -430,16 +430,6 @@ TEST_P(Loop8Test9Param, ValueCheck) { using std::tr1::make_tuple; -#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - MMX, Loop8Test6Param, - ::testing::Values( - make_tuple(&vpx_lpf_horizontal_4_mmx, - &vpx_lpf_horizontal_4_c, 8), - make_tuple(&vpx_lpf_vertical_4_mmx, - &vpx_lpf_vertical_4_c, 8))); -#endif // HAVE_MMX - #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( @@ -497,12 +487,16 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, Loop8Test6Param, ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_sse2, + &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_horizontal_8_sse2, &vpx_lpf_horizontal_8_c, 8), make_tuple(&vpx_lpf_horizontal_edge_8_sse2, &vpx_lpf_horizontal_edge_8_c, 8), make_tuple(&vpx_lpf_horizontal_edge_16_sse2, &vpx_lpf_horizontal_edge_16_c, 8), + make_tuple(&vpx_lpf_vertical_4_sse2, + &vpx_lpf_vertical_4_c, 8), make_tuple(&vpx_lpf_vertical_8_sse2, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_sse2, diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 3e65fecfb..2acf744d5 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -191,14 +191,15 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c, INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, - vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, + NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, vpx_tm_predictor_4x4_sse2) #endif // HAVE_SSE2 && CONFIG_USE_X86INC #if HAVE_SSSE3 && CONFIG_USE_X86INC INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, - NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL, - vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3, + NULL, NULL, NULL, NULL, + vpx_d153_predictor_4x4_ssse3, NULL, vpx_d63_predictor_4x4_ssse3, NULL) #endif // HAVE_SSSE3 && CONFIG_USE_X86INC @@ -240,13 +241,13 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c, INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, - vpx_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL, - NULL, vpx_tm_predictor_8x8_sse2) + vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, + NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) #endif // HAVE_SSE2 && CONFIG_USE_X86INC #if HAVE_SSSE3 && CONFIG_USE_X86INC INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, - NULL, vpx_d45_predictor_8x8_ssse3, NULL, NULL, + NULL, NULL, NULL, NULL, vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) #endif // HAVE_SSSE3 && CONFIG_USE_X86INC diff --git a/test/variance_test.cc b/test/variance_test.cc index a6efc92d7..cb6339041 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -976,16 +976,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P( - MMX, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_mmx, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_mmx, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_mmx, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_mmx, 0))); -#endif // HAVE_MMX - #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, ::testing::Values(vpx_get_mb_ss_sse2)); @@ -1026,8 +1016,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse, 0))); + make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelAvgVarianceTest, @@ -1043,8 +1033,8 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse, 0))); + make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #endif // CONFIG_USE_X86INC #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c index 403b209a2..00110f30b 100644 --- a/vp10/common/vp10_inv_txfm.c +++ b/vp10/common/vp10_inv_txfm.c @@ -35,10 +35,10 @@ void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { c1 = e1 - c1; a1 -= b1; d1 += c1; - op[0] = WRAPLOW(a1, 8); - op[1] = WRAPLOW(b1, 8); - op[2] = WRAPLOW(c1, 8); - op[3] = WRAPLOW(d1, 8); + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); ip += 4; op += 4; } @@ -78,8 +78,8 @@ void vp10_iwht4x4_1_add_c(const tran_low_t *in, a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; - op[0] = WRAPLOW(a1, 8); - op[1] = op[2] = op[3] = WRAPLOW(e1, 8); + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); ip = tmp; for (i = 0; i < 4; i++) { @@ -100,18 +100,18 @@ void vp10_idct4_c(const tran_low_t *input, tran_low_t *output) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); // stage 2 - output[0] = WRAPLOW(step[0] + step[3], 8); - output[1] = WRAPLOW(step[1] + step[2], 8); - output[2] = WRAPLOW(step[1] - step[2], 8); - output[3] = WRAPLOW(step[0] - step[3], 8); + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); } void vp10_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -143,8 +143,8 @@ void vp10_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -166,48 +166,48 @@ void vp10_idct8_c(const tran_low_t *input, tran_low_t *output) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); // stage 2 temp1 = (step1[0] + step1[2]) * cospi_16_64; temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); step1[7] = step2[7]; // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7], 8); - output[1] = WRAPLOW(step1[1] + step1[6], 8); - output[2] = WRAPLOW(step1[2] + step1[5], 8); - output[3] = WRAPLOW(step1[3] + step1[4], 8); - output[4] = WRAPLOW(step1[3] - step1[4], 8); - output[5] = WRAPLOW(step1[2] - step1[5], 8); - output[6] = WRAPLOW(step1[1] - step1[6], 8); - output[7] = WRAPLOW(step1[0] - step1[7], 8); + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); } void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -238,8 +238,8 @@ void vp10_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vp10_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) @@ -279,10 +279,10 @@ void vp10_iadst4_c(const tran_low_t *input, tran_low_t *output) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8); - output[2] = WRAPLOW(dct_const_round_shift(s2), 8); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8); + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) { @@ -313,14 +313,14 @@ void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) { s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); - x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8); + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); // stage 2 s0 = (int)x0; @@ -332,14 +332,14 @@ void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) { s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); - x0 = WRAPLOW(s0 + s2, 8); - x1 = WRAPLOW(s1 + s3, 8); - x2 = WRAPLOW(s0 - s2, 8); - x3 = WRAPLOW(s1 - s3, 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); // stage 3 s2 = (int)(cospi_16_64 * (x2 + x3)); @@ -347,19 +347,19 @@ void vp10_iadst8_c(const tran_low_t *input, tran_low_t *output) { s6 = (int)(cospi_16_64 * (x6 + x7)); s7 = (int)(cospi_16_64 * (x6 - x7)); - x2 = WRAPLOW(dct_const_round_shift(s2), 8); - x3 = WRAPLOW(dct_const_round_shift(s3), 8); - x6 = WRAPLOW(dct_const_round_shift(s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s7), 8); - - output[0] = WRAPLOW(x0, 8); - output[1] = WRAPLOW(-x4, 8); - output[2] = WRAPLOW(x6, 8); - output[3] = WRAPLOW(-x2, 8); - output[4] = WRAPLOW(x3, 8); - output[5] = WRAPLOW(-x7, 8); - output[6] = WRAPLOW(x5, 8); - output[7] = WRAPLOW(-x1, 8); + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); } void vp10_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -422,23 +422,23 @@ void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); // stage 3 step1[0] = step2[0]; @@ -448,109 +448,109 @@ void vp10_idct16_c(const tran_low_t *input, tran_low_t *output) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step1[8] = WRAPLOW(step2[8] + step2[9], 8); - step1[9] = WRAPLOW(step2[8] - step2[9], 8); - step1[10] = WRAPLOW(-step2[10] + step2[11], 8); - step1[11] = WRAPLOW(step2[10] + step2[11], 8); - step1[12] = WRAPLOW(step2[12] + step2[13], 8); - step1[13] = WRAPLOW(step2[12] - step2[13], 8); - step1[14] = WRAPLOW(-step2[14] + step2[15], 8); - step1[15] = WRAPLOW(step2[14] + step2[15], 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); step2[11] = step1[11]; step2[12] = step1[12]; // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], 8); - step1[9] = WRAPLOW(step2[9] + step2[10], 8); - step1[10] = WRAPLOW(step2[9] - step2[10], 8); - step1[11] = WRAPLOW(step2[8] - step2[11], 8); - step1[12] = WRAPLOW(-step2[12] + step2[15], 8); - step1[13] = WRAPLOW(-step2[13] + step2[14], 8); - step1[14] = WRAPLOW(step2[13] + step2[14], 8); - step1[15] = WRAPLOW(step2[12] + step2[15], 8); + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], 8); - step2[1] = WRAPLOW(step1[1] + step1[6], 8); - step2[2] = WRAPLOW(step1[2] + step1[5], 8); - step2[3] = WRAPLOW(step1[3] + step1[4], 8); - step2[4] = WRAPLOW(step1[3] - step1[4], 8); - step2[5] = WRAPLOW(step1[2] - step1[5], 8); - step2[6] = WRAPLOW(step1[1] - step1[6], 8); - step2[7] = WRAPLOW(step1[0] - step1[7], 8); + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15], 8); - output[1] = WRAPLOW(step2[1] + step2[14], 8); - output[2] = WRAPLOW(step2[2] + step2[13], 8); - output[3] = WRAPLOW(step2[3] + step2[12], 8); - output[4] = WRAPLOW(step2[4] + step2[11], 8); - output[5] = WRAPLOW(step2[5] + step2[10], 8); - output[6] = WRAPLOW(step2[6] + step2[9], 8); - output[7] = WRAPLOW(step2[7] + step2[8], 8); - output[8] = WRAPLOW(step2[7] - step2[8], 8); - output[9] = WRAPLOW(step2[6] - step2[9], 8); - output[10] = WRAPLOW(step2[5] - step2[10], 8); - output[11] = WRAPLOW(step2[4] - step2[11], 8); - output[12] = WRAPLOW(step2[3] - step2[12], 8); - output[13] = WRAPLOW(step2[2] - step2[13], 8); - output[14] = WRAPLOW(step2[1] - step2[14], 8); - output[15] = WRAPLOW(step2[0] - step2[15], 8); + output[0] = WRAPLOW(step2[0] + step2[15]); + output[1] = WRAPLOW(step2[1] + step2[14]); + output[2] = WRAPLOW(step2[2] + step2[13]); + output[3] = WRAPLOW(step2[3] + step2[12]); + output[4] = WRAPLOW(step2[4] + step2[11]); + output[5] = WRAPLOW(step2[5] + step2[10]); + output[6] = WRAPLOW(step2[6] + step2[9]); + output[7] = WRAPLOW(step2[7] + step2[8]); + output[8] = WRAPLOW(step2[7] - step2[8]); + output[9] = WRAPLOW(step2[6] - step2[9]); + output[10] = WRAPLOW(step2[5] - step2[10]); + output[11] = WRAPLOW(step2[4] - step2[11]); + output[12] = WRAPLOW(step2[3] - step2[12]); + output[13] = WRAPLOW(step2[2] - step2[13]); + output[14] = WRAPLOW(step2[1] - step2[14]); + output[15] = WRAPLOW(step2[0] - step2[15]); } void vp10_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, @@ -627,22 +627,22 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8); + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); // stage 2 s0 = x0; @@ -662,22 +662,22 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - x0 = WRAPLOW(s0 + s4, 8); - x1 = WRAPLOW(s1 + s5, 8); - x2 = WRAPLOW(s2 + s6, 8); - x3 = WRAPLOW(s3 + s7, 8); - x4 = WRAPLOW(s0 - s4, 8); - x5 = WRAPLOW(s1 - s5, 8); - x6 = WRAPLOW(s2 - s6, 8); - x7 = WRAPLOW(s3 - s7, 8); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8); + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); // stage 3 s0 = x0; @@ -697,22 +697,22 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - x0 = WRAPLOW(check_range(s0 + s2), 8); - x1 = WRAPLOW(check_range(s1 + s3), 8); - x2 = WRAPLOW(check_range(s0 - s2), 8); - x3 = WRAPLOW(check_range(s1 - s3), 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); - x8 = WRAPLOW(check_range(s8 + s10), 8); - x9 = WRAPLOW(check_range(s9 + s11), 8); - x10 = WRAPLOW(check_range(s8 - s10), 8); - x11 = WRAPLOW(check_range(s9 - s11), 8); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8); + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -724,31 +724,31 @@ void vp10_iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(dct_const_round_shift(s2), 8); - x3 = WRAPLOW(dct_const_round_shift(s3), 8); - x6 = WRAPLOW(dct_const_round_shift(s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s7), 8); - x10 = WRAPLOW(dct_const_round_shift(s10), 8); - x11 = WRAPLOW(dct_const_round_shift(s11), 8); - x14 = WRAPLOW(dct_const_round_shift(s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s15), 8); - - output[0] = WRAPLOW(x0, 8); - output[1] = WRAPLOW(-x8, 8); - output[2] = WRAPLOW(x12, 8); - output[3] = WRAPLOW(-x4, 8); - output[4] = WRAPLOW(x6, 8); - output[5] = WRAPLOW(x14, 8); - output[6] = WRAPLOW(x10, 8); - output[7] = WRAPLOW(x2, 8); - output[8] = WRAPLOW(x3, 8); - output[9] = WRAPLOW(x11, 8); - output[10] = WRAPLOW(x15, 8); - output[11] = WRAPLOW(x7, 8); - output[12] = WRAPLOW(x5, 8); - output[13] = WRAPLOW(-x13, 8); - output[14] = WRAPLOW(x9, 8); - output[15] = WRAPLOW(-x1, 8); + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); } void vp10_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, @@ -784,8 +784,8 @@ void vp10_idct16x16_1_add_c(const tran_low_t *input, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) @@ -818,43 +818,43 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) { temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); // stage 2 step2[0] = step1[0]; @@ -868,40 +868,40 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step2[16] = WRAPLOW(step1[16] + step1[17], 8); - step2[17] = WRAPLOW(step1[16] - step1[17], 8); - step2[18] = WRAPLOW(-step1[18] + step1[19], 8); - step2[19] = WRAPLOW(step1[18] + step1[19], 8); - step2[20] = WRAPLOW(step1[20] + step1[21], 8); - step2[21] = WRAPLOW(step1[20] - step1[21], 8); - step2[22] = WRAPLOW(-step1[22] + step1[23], 8); - step2[23] = WRAPLOW(step1[22] + step1[23], 8); - step2[24] = WRAPLOW(step1[24] + step1[25], 8); - step2[25] = WRAPLOW(step1[24] - step1[25], 8); - step2[26] = WRAPLOW(-step1[26] + step1[27], 8); - step2[27] = WRAPLOW(step1[26] + step1[27], 8); - step2[28] = WRAPLOW(step1[28] + step1[29], 8); - step2[29] = WRAPLOW(step1[28] - step1[29], 8); - step2[30] = WRAPLOW(-step1[30] + step1[31], 8); - step2[31] = WRAPLOW(step1[30] + step1[31], 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); // stage 3 step1[0] = step2[0]; @@ -911,42 +911,42 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step1[8] = WRAPLOW(step2[8] + step2[9], 8); - step1[9] = WRAPLOW(step2[8] - step2[9], 8); - step1[10] = WRAPLOW(-step2[10] + step2[11], 8); - step1[11] = WRAPLOW(step2[10] + step2[11], 8); - step1[12] = WRAPLOW(step2[12] + step2[13], 8); - step1[13] = WRAPLOW(step2[12] - step2[13], 8); - step1[14] = WRAPLOW(-step2[14] + step2[15], 8); - step1[15] = WRAPLOW(step2[14] + step2[15], 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); step1[16] = step2[16]; step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -955,87 +955,87 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); step2[11] = step1[11]; step2[12] = step1[12]; - step2[16] = WRAPLOW(step1[16] + step1[19], 8); - step2[17] = WRAPLOW(step1[17] + step1[18], 8); - step2[18] = WRAPLOW(step1[17] - step1[18], 8); - step2[19] = WRAPLOW(step1[16] - step1[19], 8); - step2[20] = WRAPLOW(-step1[20] + step1[23], 8); - step2[21] = WRAPLOW(-step1[21] + step1[22], 8); - step2[22] = WRAPLOW(step1[21] + step1[22], 8); - step2[23] = WRAPLOW(step1[20] + step1[23], 8); - - step2[24] = WRAPLOW(step1[24] + step1[27], 8); - step2[25] = WRAPLOW(step1[25] + step1[26], 8); - step2[26] = WRAPLOW(step1[25] - step1[26], 8); - step2[27] = WRAPLOW(step1[24] - step1[27], 8); - step2[28] = WRAPLOW(-step1[28] + step1[31], 8); - step2[29] = WRAPLOW(-step1[29] + step1[30], 8); - step2[30] = WRAPLOW(step1[29] + step1[30], 8); - step2[31] = WRAPLOW(step1[28] + step1[31], 8); + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], 8); - step1[9] = WRAPLOW(step2[9] + step2[10], 8); - step1[10] = WRAPLOW(step2[9] - step2[10], 8); - step1[11] = WRAPLOW(step2[8] - step2[11], 8); - step1[12] = WRAPLOW(-step2[12] + step2[15], 8); - step1[13] = WRAPLOW(-step2[13] + step2[14], 8); - step1[14] = WRAPLOW(step2[13] + step2[14], 8); - step1[15] = WRAPLOW(step2[12] + step2[15], 8); + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); step1[16] = step2[16]; step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -1044,62 +1044,62 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) { step1[31] = step2[31]; // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], 8); - step2[1] = WRAPLOW(step1[1] + step1[6], 8); - step2[2] = WRAPLOW(step1[2] + step1[5], 8); - step2[3] = WRAPLOW(step1[3] + step1[4], 8); - step2[4] = WRAPLOW(step1[3] - step1[4], 8); - step2[5] = WRAPLOW(step1[2] - step1[5], 8); - step2[6] = WRAPLOW(step1[1] - step1[6], 8); - step2[7] = WRAPLOW(step1[0] - step1[7], 8); + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); step2[14] = step1[14]; step2[15] = step1[15]; - step2[16] = WRAPLOW(step1[16] + step1[23], 8); - step2[17] = WRAPLOW(step1[17] + step1[22], 8); - step2[18] = WRAPLOW(step1[18] + step1[21], 8); - step2[19] = WRAPLOW(step1[19] + step1[20], 8); - step2[20] = WRAPLOW(step1[19] - step1[20], 8); - step2[21] = WRAPLOW(step1[18] - step1[21], 8); - step2[22] = WRAPLOW(step1[17] - step1[22], 8); - step2[23] = WRAPLOW(step1[16] - step1[23], 8); - - step2[24] = WRAPLOW(-step1[24] + step1[31], 8); - step2[25] = WRAPLOW(-step1[25] + step1[30], 8); - step2[26] = WRAPLOW(-step1[26] + step1[29], 8); - step2[27] = WRAPLOW(-step1[27] + step1[28], 8); - step2[28] = WRAPLOW(step1[27] + step1[28], 8); - step2[29] = WRAPLOW(step1[26] + step1[29], 8); - step2[30] = WRAPLOW(step1[25] + step1[30], 8); - step2[31] = WRAPLOW(step1[24] + step1[31], 8); + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15], 8); - step1[1] = WRAPLOW(step2[1] + step2[14], 8); - step1[2] = WRAPLOW(step2[2] + step2[13], 8); - step1[3] = WRAPLOW(step2[3] + step2[12], 8); - step1[4] = WRAPLOW(step2[4] + step2[11], 8); - step1[5] = WRAPLOW(step2[5] + step2[10], 8); - step1[6] = WRAPLOW(step2[6] + step2[9], 8); - step1[7] = WRAPLOW(step2[7] + step2[8], 8); - step1[8] = WRAPLOW(step2[7] - step2[8], 8); - step1[9] = WRAPLOW(step2[6] - step2[9], 8); - step1[10] = WRAPLOW(step2[5] - step2[10], 8); - step1[11] = WRAPLOW(step2[4] - step2[11], 8); - step1[12] = WRAPLOW(step2[3] - step2[12], 8); - step1[13] = WRAPLOW(step2[2] - step2[13], 8); - step1[14] = WRAPLOW(step2[1] - step2[14], 8); - step1[15] = WRAPLOW(step2[0] - step2[15], 8); + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); step1[16] = step2[16]; step1[17] = step2[17]; @@ -1107,58 +1107,58 @@ void vp10_idct32_c(const tran_low_t *input, tran_low_t *output) { step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; // final stage - output[0] = WRAPLOW(step1[0] + step1[31], 8); - output[1] = WRAPLOW(step1[1] + step1[30], 8); - output[2] = WRAPLOW(step1[2] + step1[29], 8); - output[3] = WRAPLOW(step1[3] + step1[28], 8); - output[4] = WRAPLOW(step1[4] + step1[27], 8); - output[5] = WRAPLOW(step1[5] + step1[26], 8); - output[6] = WRAPLOW(step1[6] + step1[25], 8); - output[7] = WRAPLOW(step1[7] + step1[24], 8); - output[8] = WRAPLOW(step1[8] + step1[23], 8); - output[9] = WRAPLOW(step1[9] + step1[22], 8); - output[10] = WRAPLOW(step1[10] + step1[21], 8); - output[11] = WRAPLOW(step1[11] + step1[20], 8); - output[12] = WRAPLOW(step1[12] + step1[19], 8); - output[13] = WRAPLOW(step1[13] + step1[18], 8); - output[14] = WRAPLOW(step1[14] + step1[17], 8); - output[15] = WRAPLOW(step1[15] + step1[16], 8); - output[16] = WRAPLOW(step1[15] - step1[16], 8); - output[17] = WRAPLOW(step1[14] - step1[17], 8); - output[18] = WRAPLOW(step1[13] - step1[18], 8); - output[19] = WRAPLOW(step1[12] - step1[19], 8); - output[20] = WRAPLOW(step1[11] - step1[20], 8); - output[21] = WRAPLOW(step1[10] - step1[21], 8); - output[22] = WRAPLOW(step1[9] - step1[22], 8); - output[23] = WRAPLOW(step1[8] - step1[23], 8); - output[24] = WRAPLOW(step1[7] - step1[24], 8); - output[25] = WRAPLOW(step1[6] - step1[25], 8); - output[26] = WRAPLOW(step1[5] - step1[26], 8); - output[27] = WRAPLOW(step1[4] - step1[27], 8); - output[28] = WRAPLOW(step1[3] - step1[28], 8); - output[29] = WRAPLOW(step1[2] - step1[29], 8); - output[30] = WRAPLOW(step1[1] - step1[30], 8); - output[31] = WRAPLOW(step1[0] - step1[31], 8); + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); } void vp10_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, @@ -1234,8 +1234,8 @@ void vp10_idct32x32_1_add_c(const tran_low_t *input, int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { @@ -1269,10 +1269,10 @@ void vp10_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, c1 = e1 - c1; a1 -= b1; d1 += c1; - op[0] = WRAPLOW(a1, bd); - op[1] = WRAPLOW(b1, bd); - op[2] = WRAPLOW(c1, bd); - op[3] = WRAPLOW(d1, bd); + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = HIGHBD_WRAPLOW(b1, bd); + op[2] = HIGHBD_WRAPLOW(c1, bd); + op[3] = HIGHBD_WRAPLOW(d1, bd); ip += 4; op += 4; } @@ -1313,8 +1313,8 @@ void vp10_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; - op[0] = WRAPLOW(a1, bd); - op[1] = op[2] = op[3] = WRAPLOW(e1, bd); + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); ip = tmp; for (i = 0; i < 4; i++) { @@ -1340,18 +1340,18 @@ void vp10_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 2 - output[0] = WRAPLOW(step[0] + step[3], bd); - output[1] = WRAPLOW(step[1] + step[2], bd); - output[2] = WRAPLOW(step[1] - step[2], bd); - output[3] = WRAPLOW(step[0] - step[3], bd); + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } void vp10_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1385,11 +1385,11 @@ void vp10_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1411,39 +1411,39 @@ void vp10_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half vp10_highbd_idct4_c(step1, step1, bd); // stage 2 - odd half - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); // stage 3 - odd half step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7], bd); - output[1] = WRAPLOW(step1[1] + step1[6], bd); - output[2] = WRAPLOW(step1[2] + step1[5], bd); - output[3] = WRAPLOW(step1[3] + step1[4], bd); - output[4] = WRAPLOW(step1[3] - step1[4], bd); - output[5] = WRAPLOW(step1[2] - step1[5], bd); - output[6] = WRAPLOW(step1[1] - step1[6], bd); - output[7] = WRAPLOW(step1[0] - step1[7], bd); + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } void vp10_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1477,10 +1477,10 @@ void vp10_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) @@ -1521,10 +1521,10 @@ void vp10_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); - output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); - output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); + output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); } void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1555,14 +1555,14 @@ void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); - x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); + x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); // stage 2 s0 = x0; @@ -1574,14 +1574,14 @@ void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - x0 = WRAPLOW(s0 + s2, bd); - x1 = WRAPLOW(s1 + s3, bd); - x2 = WRAPLOW(s0 - s2, bd); - x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1589,19 +1589,19 @@ void vp10_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); - - output[0] = WRAPLOW(x0, bd); - output[1] = WRAPLOW(-x4, bd); - output[2] = WRAPLOW(x6, bd); - output[3] = WRAPLOW(-x2, bd); - output[4] = WRAPLOW(x3, bd); - output[5] = WRAPLOW(-x7, bd); - output[6] = WRAPLOW(x5, bd); - output[7] = WRAPLOW(-x1, bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); } void vp10_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1666,23 +1666,23 @@ void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1692,109 +1692,109 @@ void vp10_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step1[8] = WRAPLOW(step2[8] + step2[9], bd); - step1[9] = WRAPLOW(step2[8] - step2[9], bd); - step1[10] = WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = WRAPLOW(step2[10] + step2[11], bd); - step1[12] = WRAPLOW(step2[12] + step2[13], bd); - step1[13] = WRAPLOW(step2[12] - step2[13], bd); - step1[14] = WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = WRAPLOW(step2[14] + step2[15], bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); + step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], bd); - step1[1] = WRAPLOW(step2[1] + step2[2], bd); - step1[2] = WRAPLOW(step2[1] - step2[2], bd); - step1[3] = WRAPLOW(step2[0] - step2[3], bd); + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], bd); - step1[9] = WRAPLOW(step2[9] + step2[10], bd); - step1[10] = WRAPLOW(step2[9] - step2[10], bd); - step1[11] = WRAPLOW(step2[8] - step2[11], bd); - step1[12] = WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = WRAPLOW(step2[13] + step2[14], bd); - step1[15] = WRAPLOW(step2[12] + step2[15], bd); + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], bd); - step2[1] = WRAPLOW(step1[1] + step1[6], bd); - step2[2] = WRAPLOW(step1[2] + step1[5], bd); - step2[3] = WRAPLOW(step1[3] + step1[4], bd); - step2[4] = WRAPLOW(step1[3] - step1[4], bd); - step2[5] = WRAPLOW(step1[2] - step1[5], bd); - step2[6] = WRAPLOW(step1[1] - step1[6], bd); - step2[7] = WRAPLOW(step1[0] - step1[7], bd); + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15], bd); - output[1] = WRAPLOW(step2[1] + step2[14], bd); - output[2] = WRAPLOW(step2[2] + step2[13], bd); - output[3] = WRAPLOW(step2[3] + step2[12], bd); - output[4] = WRAPLOW(step2[4] + step2[11], bd); - output[5] = WRAPLOW(step2[5] + step2[10], bd); - output[6] = WRAPLOW(step2[6] + step2[9], bd); - output[7] = WRAPLOW(step2[7] + step2[8], bd); - output[8] = WRAPLOW(step2[7] - step2[8], bd); - output[9] = WRAPLOW(step2[6] - step2[9], bd); - output[10] = WRAPLOW(step2[5] - step2[10], bd); - output[11] = WRAPLOW(step2[4] - step2[11], bd); - output[12] = WRAPLOW(step2[3] - step2[12], bd); - output[13] = WRAPLOW(step2[2] - step2[13], bd); - output[14] = WRAPLOW(step2[1] - step2[14], bd); - output[15] = WRAPLOW(step2[0] - step2[15], bd); + output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } void vp10_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1874,22 +1874,22 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); - x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); - x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); - x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); + x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; @@ -1909,22 +1909,22 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - x0 = WRAPLOW(s0 + s4, bd); - x1 = WRAPLOW(s1 + s5, bd); - x2 = WRAPLOW(s2 + s6, bd); - x3 = WRAPLOW(s3 + s7, bd); - x4 = WRAPLOW(s0 - s4, bd); - x5 = WRAPLOW(s1 - s5, bd); - x6 = WRAPLOW(s2 - s6, bd); - x7 = WRAPLOW(s3 - s7, bd); - x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); - x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); // stage 3 s0 = x0; @@ -1944,22 +1944,22 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - x0 = WRAPLOW(s0 + s2, bd); - x1 = WRAPLOW(s1 + s3, bd); - x2 = WRAPLOW(s0 - s2, bd); - x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); - x8 = WRAPLOW(s8 + s10, bd); - x9 = WRAPLOW(s9 + s11, bd); - x10 = WRAPLOW(s8 - s10, bd); - x11 = WRAPLOW(s9 - s11, bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -1971,31 +1971,31 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); - - output[0] = WRAPLOW(x0, bd); - output[1] = WRAPLOW(-x8, bd); - output[2] = WRAPLOW(x12, bd); - output[3] = WRAPLOW(-x4, bd); - output[4] = WRAPLOW(x6, bd); - output[5] = WRAPLOW(x14, bd); - output[6] = WRAPLOW(x10, bd); - output[7] = WRAPLOW(x2, bd); - output[8] = WRAPLOW(x3, bd); - output[9] = WRAPLOW(x11, bd); - output[10] = WRAPLOW(x15, bd); - output[11] = WRAPLOW(x7, bd); - output[12] = WRAPLOW(x5, bd); - output[13] = WRAPLOW(-x13, bd); - output[14] = WRAPLOW(x9, bd); - output[15] = WRAPLOW(-x1, bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); } void vp10_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2030,11 +2030,11 @@ void vp10_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) @@ -2069,43 +2069,43 @@ static void highbd_idct32_c(const tran_low_t *input, temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2119,40 +2119,40 @@ static void highbd_idct32_c(const tran_low_t *input, temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step2[16] = WRAPLOW(step1[16] + step1[17], bd); - step2[17] = WRAPLOW(step1[16] - step1[17], bd); - step2[18] = WRAPLOW(-step1[18] + step1[19], bd); - step2[19] = WRAPLOW(step1[18] + step1[19], bd); - step2[20] = WRAPLOW(step1[20] + step1[21], bd); - step2[21] = WRAPLOW(step1[20] - step1[21], bd); - step2[22] = WRAPLOW(-step1[22] + step1[23], bd); - step2[23] = WRAPLOW(step1[22] + step1[23], bd); - step2[24] = WRAPLOW(step1[24] + step1[25], bd); - step2[25] = WRAPLOW(step1[24] - step1[25], bd); - step2[26] = WRAPLOW(-step1[26] + step1[27], bd); - step2[27] = WRAPLOW(step1[26] + step1[27], bd); - step2[28] = WRAPLOW(step1[28] + step1[29], bd); - step2[29] = WRAPLOW(step1[28] - step1[29], bd); - step2[30] = WRAPLOW(-step1[30] + step1[31], bd); - step2[31] = WRAPLOW(step1[30] + step1[31], bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); + step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); + step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); + step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); + step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); + step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); // stage 3 step1[0] = step2[0]; @@ -2162,42 +2162,42 @@ static void highbd_idct32_c(const tran_low_t *input, temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step1[8] = WRAPLOW(step2[8] + step2[9], bd); - step1[9] = WRAPLOW(step2[8] - step2[9], bd); - step1[10] = WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = WRAPLOW(step2[10] + step2[11], bd); - step1[12] = WRAPLOW(step2[12] + step2[13], bd); - step1[13] = WRAPLOW(step2[12] - step2[13], bd); - step1[14] = WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = WRAPLOW(step2[14] + step2[15], bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); step1[16] = step2[16]; step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2206,87 +2206,87 @@ static void highbd_idct32_c(const tran_low_t *input, // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); + step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; - step2[16] = WRAPLOW(step1[16] + step1[19], bd); - step2[17] = WRAPLOW(step1[17] + step1[18], bd); - step2[18] = WRAPLOW(step1[17] - step1[18], bd); - step2[19] = WRAPLOW(step1[16] - step1[19], bd); - step2[20] = WRAPLOW(-step1[20] + step1[23], bd); - step2[21] = WRAPLOW(-step1[21] + step1[22], bd); - step2[22] = WRAPLOW(step1[21] + step1[22], bd); - step2[23] = WRAPLOW(step1[20] + step1[23], bd); - - step2[24] = WRAPLOW(step1[24] + step1[27], bd); - step2[25] = WRAPLOW(step1[25] + step1[26], bd); - step2[26] = WRAPLOW(step1[25] - step1[26], bd); - step2[27] = WRAPLOW(step1[24] - step1[27], bd); - step2[28] = WRAPLOW(-step1[28] + step1[31], bd); - step2[29] = WRAPLOW(-step1[29] + step1[30], bd); - step2[30] = WRAPLOW(step1[29] + step1[30], bd); - step2[31] = WRAPLOW(step1[28] + step1[31], bd); + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); + step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); + step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); + step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); + step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); + step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], bd); - step1[1] = WRAPLOW(step2[1] + step2[2], bd); - step1[2] = WRAPLOW(step2[1] - step2[2], bd); - step1[3] = WRAPLOW(step2[0] - step2[3], bd); + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], bd); - step1[9] = WRAPLOW(step2[9] + step2[10], bd); - step1[10] = WRAPLOW(step2[9] - step2[10], bd); - step1[11] = WRAPLOW(step2[8] - step2[11], bd); - step1[12] = WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = WRAPLOW(step2[13] + step2[14], bd); - step1[15] = WRAPLOW(step2[12] + step2[15], bd); + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); step1[16] = step2[16]; step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2295,62 +2295,62 @@ static void highbd_idct32_c(const tran_low_t *input, step1[31] = step2[31]; // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], bd); - step2[1] = WRAPLOW(step1[1] + step1[6], bd); - step2[2] = WRAPLOW(step1[2] + step1[5], bd); - step2[3] = WRAPLOW(step1[3] + step1[4], bd); - step2[4] = WRAPLOW(step1[3] - step1[4], bd); - step2[5] = WRAPLOW(step1[2] - step1[5], bd); - step2[6] = WRAPLOW(step1[1] - step1[6], bd); - step2[7] = WRAPLOW(step1[0] - step1[7], bd); + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; - step2[16] = WRAPLOW(step1[16] + step1[23], bd); - step2[17] = WRAPLOW(step1[17] + step1[22], bd); - step2[18] = WRAPLOW(step1[18] + step1[21], bd); - step2[19] = WRAPLOW(step1[19] + step1[20], bd); - step2[20] = WRAPLOW(step1[19] - step1[20], bd); - step2[21] = WRAPLOW(step1[18] - step1[21], bd); - step2[22] = WRAPLOW(step1[17] - step1[22], bd); - step2[23] = WRAPLOW(step1[16] - step1[23], bd); - - step2[24] = WRAPLOW(-step1[24] + step1[31], bd); - step2[25] = WRAPLOW(-step1[25] + step1[30], bd); - step2[26] = WRAPLOW(-step1[26] + step1[29], bd); - step2[27] = WRAPLOW(-step1[27] + step1[28], bd); - step2[28] = WRAPLOW(step1[27] + step1[28], bd); - step2[29] = WRAPLOW(step1[26] + step1[29], bd); - step2[30] = WRAPLOW(step1[25] + step1[30], bd); - step2[31] = WRAPLOW(step1[24] + step1[31], bd); + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); + step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); + step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); + step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); + step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); + step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15], bd); - step1[1] = WRAPLOW(step2[1] + step2[14], bd); - step1[2] = WRAPLOW(step2[2] + step2[13], bd); - step1[3] = WRAPLOW(step2[3] + step2[12], bd); - step1[4] = WRAPLOW(step2[4] + step2[11], bd); - step1[5] = WRAPLOW(step2[5] + step2[10], bd); - step1[6] = WRAPLOW(step2[6] + step2[9], bd); - step1[7] = WRAPLOW(step2[7] + step2[8], bd); - step1[8] = WRAPLOW(step2[7] - step2[8], bd); - step1[9] = WRAPLOW(step2[6] - step2[9], bd); - step1[10] = WRAPLOW(step2[5] - step2[10], bd); - step1[11] = WRAPLOW(step2[4] - step2[11], bd); - step1[12] = WRAPLOW(step2[3] - step2[12], bd); - step1[13] = WRAPLOW(step2[2] - step2[13], bd); - step1[14] = WRAPLOW(step2[1] - step2[14], bd); - step1[15] = WRAPLOW(step2[0] - step2[15], bd); + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); step1[16] = step2[16]; step1[17] = step2[17]; @@ -2358,58 +2358,58 @@ static void highbd_idct32_c(const tran_low_t *input, step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; // final stage - output[0] = WRAPLOW(step1[0] + step1[31], bd); - output[1] = WRAPLOW(step1[1] + step1[30], bd); - output[2] = WRAPLOW(step1[2] + step1[29], bd); - output[3] = WRAPLOW(step1[3] + step1[28], bd); - output[4] = WRAPLOW(step1[4] + step1[27], bd); - output[5] = WRAPLOW(step1[5] + step1[26], bd); - output[6] = WRAPLOW(step1[6] + step1[25], bd); - output[7] = WRAPLOW(step1[7] + step1[24], bd); - output[8] = WRAPLOW(step1[8] + step1[23], bd); - output[9] = WRAPLOW(step1[9] + step1[22], bd); - output[10] = WRAPLOW(step1[10] + step1[21], bd); - output[11] = WRAPLOW(step1[11] + step1[20], bd); - output[12] = WRAPLOW(step1[12] + step1[19], bd); - output[13] = WRAPLOW(step1[13] + step1[18], bd); - output[14] = WRAPLOW(step1[14] + step1[17], bd); - output[15] = WRAPLOW(step1[15] + step1[16], bd); - output[16] = WRAPLOW(step1[15] - step1[16], bd); - output[17] = WRAPLOW(step1[14] - step1[17], bd); - output[18] = WRAPLOW(step1[13] - step1[18], bd); - output[19] = WRAPLOW(step1[12] - step1[19], bd); - output[20] = WRAPLOW(step1[11] - step1[20], bd); - output[21] = WRAPLOW(step1[10] - step1[21], bd); - output[22] = WRAPLOW(step1[9] - step1[22], bd); - output[23] = WRAPLOW(step1[8] - step1[23], bd); - output[24] = WRAPLOW(step1[7] - step1[24], bd); - output[25] = WRAPLOW(step1[6] - step1[25], bd); - output[26] = WRAPLOW(step1[5] - step1[26], bd); - output[27] = WRAPLOW(step1[4] - step1[27], bd); - output[28] = WRAPLOW(step1[3] - step1[28], bd); - output[29] = WRAPLOW(step1[2] - step1[29], bd); - output[30] = WRAPLOW(step1[1] - step1[30], bd); - output[31] = WRAPLOW(step1[0] - step1[31], bd); + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); + output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); + output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); + output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); + output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); + output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); + output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); + output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); + output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); + output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); + output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); + output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); + output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); + output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); + output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); + output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); + output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); + output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); + output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); + output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); + output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); + output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); + output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); + output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); + output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); + output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); + output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); + output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); + output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } void vp10_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2485,9 +2485,9 @@ void vp10_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/vp10/common/vp10_inv_txfm.h b/vp10/common/vp10_inv_txfm.h index 52611acbd..1751f62c3 100644 --- a/vp10/common/vp10_inv_txfm.h +++ b/vp10/common/vp10_inv_txfm.h @@ -15,13 +15,14 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/inv_txfm.h" #include "vpx_ports/mem.h" #ifdef __cplusplus extern "C" { #endif -static INLINE tran_low_t check_range(tran_high_t input) { +static INLINE tran_high_t check_range(tran_high_t input) { #if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid VP9 input streams, intermediate stage coefficients should always // stay within the range of a signed 16 bit integer. Coefficients can go out @@ -32,17 +33,17 @@ static INLINE tran_low_t check_range(tran_high_t input) { assert(INT16_MIN <= input); assert(input <= INT16_MAX); #endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return (tran_low_t)input; + return input; } -static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { +static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return check_range(rv); + return rv; } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_low_t highbd_check_range(tran_high_t input, - int bd) { +static INLINE tran_high_t highbd_check_range(tran_high_t input, + int bd) { #if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid highbitdepth VP9 streams, intermediate stage coefficients will // stay within the ranges: @@ -56,13 +57,12 @@ static INLINE tran_low_t highbd_check_range(tran_high_t input, (void) int_min; #endif // CONFIG_COEFFICIENT_RANGE_CHECKING (void) bd; - return (tran_low_t)input; + return input; } -static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, - int bd) { +static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return highbd_check_range(rv, bd); + return rv; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -83,9 +83,21 @@ static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits -#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd)) -#else -#define WRAPLOW(x, bd) ((int32_t)(x)) + +#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#else // CONFIG_EMULATE_HARDWARE + +#define WRAPLOW(x) ((int32_t)check_range(x)) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((int32_t)highbd_check_range((x), bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH + #endif // CONFIG_EMULATE_HARDWARE void vp10_idct4_c(const tran_low_t *input, tran_low_t *output); @@ -107,14 +119,14 @@ void vp10_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, int bd) { - trans = WRAPLOW(trans, bd); - return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd); + trans = HIGHBD_WRAPLOW(trans, bd); + return clip_pixel_highbd(dest + trans, bd); } #endif static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { - trans = WRAPLOW(trans, 8); - return clip_pixel(WRAPLOW(dest + trans, 8)); + trans = WRAPLOW(trans); + return clip_pixel(dest + trans); } #ifdef __cplusplus } // extern "C" diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index f2414f811..c8a10e57f 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl @@ -398,7 +398,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp10_fht16x16 sse2/; add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fwht4x4/, "$mmx_x86inc"; + specialize qw/vp10_fwht4x4/, "$sse2_x86inc"; } else { add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp10_fht4x4 sse2 msa/; @@ -410,7 +410,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp10_fht16x16 sse2 msa/; add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc"; + specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc"; } # Inverse transform diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_intrin_sse2.c index e1111570a..e1111570a 100644 --- a/vp10/encoder/x86/dct_sse2.c +++ b/vp10/encoder/x86/dct_intrin_sse2.c diff --git a/vp10/encoder/x86/dct_mmx.asm b/vp10/encoder/x86/dct_mmx.asm deleted file mode 100644 index 2327fe9e6..000000000 --- a/vp10/encoder/x86/dct_mmx.asm +++ /dev/null @@ -1,104 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp10 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro TRANSFORM_COLS 0 - paddw m0, m1 - movq m4, m0 - psubw m3, m2 - psubw m4, m3 - psraw m4, 1 - movq m5, m4 - psubw m5, m1 ;b1 - psubw m4, m2 ;c1 - psubw m0, m4 - paddw m3, m5 - ; m0 a0 - SWAP 1, 4 ; m1 c1 - SWAP 2, 3 ; m2 d1 - SWAP 3, 5 ; m3 b1 -%endmacro - -%macro TRANSPOSE_4X4 0 - movq m4, m0 - movq m5, m2 - punpcklwd m4, m1 - punpckhwd m0, m1 - punpcklwd m5, m3 - punpckhwd m2, m3 - movq m1, m4 - movq m3, m0 - punpckldq m1, m5 - punpckhdq m4, m5 - punpckldq m3, m2 - punpckhdq m0, m2 - SWAP 2, 3, 0, 1, 4 -%endmacro - -INIT_MMX mmx -cglobal fwht4x4, 3, 4, 8, input, output, stride - lea r3q, [inputq + strideq*4] - movq m0, [inputq] ;a1 - movq m1, [inputq + strideq*2] ;b1 - movq m2, [r3q] ;c1 - movq m3, [r3q + strideq*2] ;d1 - - TRANSFORM_COLS - TRANSPOSE_4X4 - TRANSFORM_COLS - TRANSPOSE_4X4 - - psllw m0, 2 - psllw m1, 2 - psllw m2, 2 - psllw m3, 2 - -%if CONFIG_VP9_HIGHBITDEPTH - pxor m4, m4 - pxor m5, m5 - pcmpgtw m4, m0 - pcmpgtw m5, m1 - movq m6, m0 - movq m7, m1 - punpcklwd m0, m4 - punpcklwd m1, m5 - punpckhwd m6, m4 - punpckhwd m7, m5 - movq [outputq], m0 - movq [outputq + 8], m6 - movq [outputq + 16], m1 - movq [outputq + 24], m7 - pxor m4, m4 - pxor m5, m5 - pcmpgtw m4, m2 - pcmpgtw m5, m3 - movq m6, m2 - movq m7, m3 - punpcklwd m2, m4 - punpcklwd m3, m5 - punpckhwd m6, m4 - punpckhwd m7, m5 - movq [outputq + 32], m2 - movq [outputq + 40], m6 - movq [outputq + 48], m3 - movq [outputq + 56], m7 -%else - movq [outputq], m0 - movq [outputq + 8], m1 - movq [outputq + 16], m2 - movq [outputq + 24], m3 -%endif - - RET diff --git a/vp10/encoder/x86/dct_sse2.asm b/vp10/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..c3a5fb552 --- /dev/null +++ b/vp10/encoder/x86/dct_sse2.asm @@ -0,0 +1,86 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp10 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + +%if CONFIG_VP9_HIGHBITDEPTH + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 +%else + mova [outputq], m0 + mova [outputq + 16], m1 +%endif + + RET diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk index dc3b27139..4f265b539 100644 --- a/vp10/vp10cx.mk +++ b/vp10/vp10cx.mk @@ -93,7 +93,7 @@ VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/highbd_block_error_intrin_sse2.c endif ifeq ($(CONFIG_USE_X86INC),yes) -VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm +VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm endif @@ -103,7 +103,7 @@ VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm endif endif -VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c +VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c index 8a7e33205..5cdd2a249 100644 --- a/vp8/decoder/dboolhuff.c +++ b/vp8/decoder/dboolhuff.c @@ -44,7 +44,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br) int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); size_t bytes_left = br->user_buffer_end - bufptr; size_t bits_left = bytes_left * CHAR_BIT; - int x = (int)(shift + CHAR_BIT - bits_left); + int x = shift + CHAR_BIT - (int)bits_left; int loop_end = 0; unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1]; diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index cc9eaaf43..1b1bbf868 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -83,7 +83,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { } { - register unsigned int shift = vp8_norm[range]; + register int shift = vp8_norm[range]; range <<= shift; value <<= shift; count -= shift; diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index f3d91b552..3196422c2 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -163,7 +163,7 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { const TOKENEXTRA *stop = p + xcount; unsigned int split; - unsigned int shift; + int shift; int count = w->count; unsigned int range = w->range; unsigned int lowvalue = w->lowvalue; diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 7c012a829..e66a2dbd8 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -65,7 +65,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register unsigned int shift; + register int shift; #ifdef VP8_ENTROPY_STATS #if defined(SECTIONBITS_OUTPUT) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 4c2acc774..95bb39400 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -2808,7 +2808,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) * static scene. */ if ( detect_transition_to_still( cpi, i, - (cpi->key_frame_frequency-i), + ((int)(cpi->key_frame_frequency) - + (int)i), loop_decay_rate, decay_accumulator ) ) { diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 9063cea76..6507ae9f1 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1899,7 +1899,8 @@ static int calculate_final_rd_costs(int this_rd, int prob_skip_cost; prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1); - prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0); + prob_skip_cost -= + (int)vp8_cost_bit(cpi->prob_skip_false, 0); rd->rate2 += prob_skip_cost; *other_cost += prob_skip_cost; } diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 461462552..183dec4e7 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -298,196 +298,168 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { static void filter_selectively_vert_row2(int subsampling_factor, uint8_t *s, int pitch, - unsigned int mask_16x16_l, - unsigned int mask_8x8_l, - unsigned int mask_4x4_l, - unsigned int mask_4x4_int_l, - const loop_filter_info_n *lfi_n, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl) { - const int mask_shift = subsampling_factor ? 4 : 8; - const int mask_cutoff = subsampling_factor ? 0xf : 0xff; + const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff; const int lfl_forward = subsampling_factor ? 4 : 8; - - unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; - unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; - unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; - unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; - unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; - unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; + const unsigned int dual_one = 1 | (1 << lfl_forward); unsigned int mask; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | - mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; - mask; mask >>= 1) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - if ((mask_16x16_0 & mask_16x16_1) & 1) { - vpx_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); - } else if (mask_16x16_0 & 1) { - vpx_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); + uint8_t *ss[2]; + ss[0] = s; + + for (mask = + (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff; + mask; mask = (mask & ~dual_one) >> 1) { + if (mask & dual_one) { + const loop_filter_thresh *lfis[2]; + lfis[0] = lfthr + *lfl; + lfis[1] = lfthr + *(lfl + lfl_forward); + ss[1] = ss[0] + 8 * pitch; + + if (mask_16x16 & dual_one) { + if ((mask_16x16 & dual_one) == dual_one) { + vpx_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr); } else { - vpx_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr); + const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)]; + vpx_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr); } } - if ((mask_8x8_0 | mask_8x8_1) & 1) { - if ((mask_8x8_0 & mask_8x8_1) & 1) { - vpx_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_8x8_0 & 1) { - vpx_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + if (mask_8x8 & dual_one) { + if ((mask_8x8 & dual_one) == dual_one) { + vpx_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr, lfis[1]->mblim, + lfis[1]->lim, lfis[1]->hev_thr); } else { - vpx_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)]; + vpx_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); } } - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - vpx_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_4x4_0 & 1) { - vpx_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + if (mask_4x4 & dual_one) { + if ((mask_4x4 & dual_one) == dual_one) { + vpx_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, lfis[0]->lim, + lfis[0]->hev_thr, lfis[1]->mblim, + lfis[1]->lim, lfis[1]->hev_thr); } else { - vpx_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)]; + vpx_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); } } - if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { - if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { - vpx_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); - } else if (mask_4x4_int_0 & 1) { - vpx_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr); + if (mask_4x4_int & dual_one) { + if ((mask_4x4_int & dual_one) == dual_one) { + vpx_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr); } else { - vpx_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, - lfi1->hev_thr); + const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)]; + vpx_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr); } } } - s += 8; + ss[0] += 8; lfl += 1; - mask_16x16_0 >>= 1; - mask_8x8_0 >>= 1; - mask_4x4_0 >>= 1; - mask_4x4_int_0 >>= 1; - mask_16x16_1 >>= 1; - mask_8x8_1 >>= 1; - mask_4x4_1 >>= 1; - mask_4x4_int_1 >>= 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; } } #if CONFIG_VP9_HIGHBITDEPTH static void highbd_filter_selectively_vert_row2(int subsampling_factor, uint16_t *s, int pitch, - unsigned int mask_16x16_l, - unsigned int mask_8x8_l, - unsigned int mask_4x4_l, - unsigned int mask_4x4_int_l, - const loop_filter_info_n *lfi_n, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_thresh *lfthr, const uint8_t *lfl, int bd) { - const int mask_shift = subsampling_factor ? 4 : 8; - const int mask_cutoff = subsampling_factor ? 0xf : 0xff; + const int dual_mask_cutoff = subsampling_factor ? 0xff : 0xffff; const int lfl_forward = subsampling_factor ? 4 : 8; - - unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; - unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; - unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; - unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; - unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; - unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; - unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; + const unsigned int dual_one = 1 | (1 << lfl_forward); unsigned int mask; - - for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | - mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; - mask; mask >>= 1) { - const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; - const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); - - if (mask & 1) { - if ((mask_16x16_0 | mask_16x16_1) & 1) { - if ((mask_16x16_0 & mask_16x16_1) & 1) { - vpx_highbd_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); - } else if (mask_16x16_0 & 1) { - vpx_highbd_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); + uint16_t *ss[2]; + ss[0] = s; + + for (mask = + (mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int) & dual_mask_cutoff; + mask; mask = (mask & ~dual_one) >> 1) { + if (mask & dual_one) { + const loop_filter_thresh *lfis[2]; + lfis[0] = lfthr + *lfl; + lfis[1] = lfthr + *(lfl + lfl_forward); + ss[1] = ss[0] + 8 * pitch; + + if (mask_16x16 & dual_one) { + if ((mask_16x16 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_16_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, bd); } else { - vpx_highbd_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + const loop_filter_thresh *lfi = lfis[!(mask_16x16 & 1)]; + vpx_highbd_lpf_vertical_16(ss[!(mask_16x16 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); } } - if ((mask_8x8_0 | mask_8x8_1) & 1) { - if ((mask_8x8_0 & mask_8x8_1) & 1) { - vpx_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_8x8_0 & 1) { - vpx_highbd_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); + if (mask_8x8 & dual_one) { + if ((mask_8x8 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_8_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr, bd); } else { - vpx_highbd_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + const loop_filter_thresh *lfi = lfis[!(mask_8x8 & 1)]; + vpx_highbd_lpf_vertical_8(ss[!(mask_8x8 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); } } - if ((mask_4x4_0 | mask_4x4_1) & 1) { - if ((mask_4x4_0 & mask_4x4_1) & 1) { - vpx_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_4x4_0 & 1) { - vpx_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); + if (mask_4x4 & dual_one) { + if ((mask_4x4 & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_4_dual(ss[0], pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr, bd); } else { - vpx_highbd_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + const loop_filter_thresh *lfi = lfis[!(mask_4x4 & 1)]; + vpx_highbd_lpf_vertical_4(ss[!(mask_4x4 & 1)], pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, bd); } } - if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { - if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { - vpx_highbd_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, lfi1->mblim, lfi1->lim, - lfi1->hev_thr, bd); - } else if (mask_4x4_int_0 & 1) { - vpx_highbd_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, - lfi0->hev_thr, bd); + if (mask_4x4_int & dual_one) { + if ((mask_4x4_int & dual_one) == dual_one) { + vpx_highbd_lpf_vertical_4_dual(ss[0] + 4, pitch, lfis[0]->mblim, + lfis[0]->lim, lfis[0]->hev_thr, + lfis[1]->mblim, lfis[1]->lim, + lfis[1]->hev_thr, bd); } else { - vpx_highbd_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, - lfi1->lim, lfi1->hev_thr, bd); + const loop_filter_thresh *lfi = lfis[!(mask_4x4_int & 1)]; + vpx_highbd_lpf_vertical_4(ss[!(mask_4x4_int & 1)] + 4, pitch, + lfi->mblim, lfi->lim, lfi->hev_thr, bd); } } } - s += 8; + ss[0] += 8; lfl += 1; - mask_16x16_0 >>= 1; - mask_8x8_0 >>= 1; - mask_4x4_0 >>= 1; - mask_4x4_int_0 >>= 1; - mask_16x16_1 >>= 1; - mask_8x8_1 >>= 1; - mask_4x4_1 >>= 1; - mask_4x4_int_1 >>= 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -497,17 +469,17 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, + const loop_filter_thresh *lfthr, const uint8_t *lfl) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - count = 1; if (mask & 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { vpx_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, @@ -520,7 +492,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); vpx_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, @@ -549,7 +521,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); vpx_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, @@ -574,7 +546,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } - } else if (mask_4x4_int & 1) { + } else { vpx_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); } @@ -594,17 +566,17 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, + const loop_filter_thresh *lfthr, const uint8_t *lfl, int bd) { unsigned int mask; int count; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= count) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; - count = 1; if (mask & 1) { + const loop_filter_thresh *lfi = lfthr + *lfl; + if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { vpx_highbd_lpf_horizontal_edge_16(s, pitch, lfi->mblim, lfi->lim, @@ -617,7 +589,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); vpx_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, @@ -650,7 +622,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { // Next block's thresholds. - const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); + const loop_filter_thresh *lfin = lfthr + *(lfl + 1); vpx_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, lfin->mblim, lfin->lim, @@ -679,7 +651,7 @@ static void highbd_filter_selectively_horiz(uint16_t *s, int pitch, lfi->lim, lfi->hev_thr, bd); } } - } else if (mask_4x4_int & 1) { + } else { vpx_highbd_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, bd); } @@ -1079,13 +1051,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, + const loop_filter_thresh *lfthr, const uint8_t *lfl) { unsigned int mask; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= 1) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi = lfthr + *lfl; if (mask & 1) { if (mask_16x16 & 1) { @@ -1113,13 +1085,13 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, unsigned int mask_8x8, unsigned int mask_4x4, unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, + const loop_filter_thresh *lfthr, const uint8_t *lfl, int bd) { unsigned int mask; for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; mask; mask >>= 1) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi = lfthr + *lfl; if (mask & 1) { if (mask_16x16 & 1) { @@ -1250,23 +1222,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, mask_8x8_c & border_mask, mask_4x4_c & border_mask, mask_4x4_int[r], - &cm->lf_info, &lfl[r << 3], + cm->lf_info.lfthr, &lfl[r << 3], (int)cm->bit_depth); } else { +#endif // CONFIG_VP9_HIGHBITDEPTH filter_selectively_vert(dst->buf, dst->stride, mask_16x16_c & border_mask, mask_8x8_c & border_mask, mask_4x4_c & border_mask, mask_4x4_int[r], - &cm->lf_info, &lfl[r << 3]); + cm->lf_info.lfthr, &lfl[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH } -#else - filter_selectively_vert(dst->buf, dst->stride, - mask_16x16_c & border_mask, - mask_8x8_c & border_mask, - mask_4x4_c & border_mask, - mask_4x4_int[r], - &cm->lf_info, &lfl[r << 3]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; mi_8x8 += row_step_stride; @@ -1299,23 +1266,18 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, mask_8x8_r, mask_4x4_r, mask_4x4_int_r, - &cm->lf_info, &lfl[r << 3], + cm->lf_info.lfthr, &lfl[r << 3], (int)cm->bit_depth); } else { +#endif // CONFIG_VP9_HIGHBITDEPTH filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, mask_4x4_r, mask_4x4_int_r, - &cm->lf_info, &lfl[r << 3]); + cm->lf_info.lfthr, &lfl[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH } -#else - filter_selectively_horiz(dst->buf, dst->stride, - mask_16x16_r, - mask_8x8_r, - mask_4x4_r, - mask_4x4_int_r, - &cm->lf_info, &lfl[r << 3]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; } @@ -1337,27 +1299,29 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm, // Vertical pass: do 2 rows at one time for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { - unsigned int mask_16x16_l = mask_16x16 & 0xffff; - unsigned int mask_8x8_l = mask_8x8 & 0xffff; - unsigned int mask_4x4_l = mask_4x4 & 0xffff; - unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff; - -// Disable filtering on the leftmost column. + // Disable filtering on the leftmost column. #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { - highbd_filter_selectively_vert_row2( - plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, - mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfm->lfl_y[r << 3], (int)cm->bit_depth); + highbd_filter_selectively_vert_row2(plane->subsampling_x, + CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, + &lfm->lfl_y[r << 3], + (int)cm->bit_depth); } else { - filter_selectively_vert_row2( - plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l, - mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]); +#endif // CONFIG_VP9_HIGHBITDEPTH + filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH } -#else - filter_selectively_vert_row2( - plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l, - mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 16 * dst->stride; mask_16x16 >>= 16; @@ -1390,19 +1354,18 @@ void vp9_filter_block_plane_ss00(VP9_COMMON *const cm, #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { - highbd_filter_selectively_horiz( - CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3], - (int)cm->bit_depth); + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, mask_16x16_r, mask_8x8_r, + mask_4x4_r, mask_4x4_int & 0xff, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3], + (int)cm->bit_depth); } else { +#endif // CONFIG_VP9_HIGHBITDEPTH filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, - &lfm->lfl_y[r << 3]); + mask_4x4_r, mask_4x4_int & 0xff, + cm->lf_info.lfthr, &lfm->lfl_y[r << 3]); +#if CONFIG_VP9_HIGHBITDEPTH } -#else - filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, - &lfm->lfl_y[r << 3]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; @@ -1436,38 +1399,35 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)]; } - { - unsigned int mask_16x16_l = mask_16x16 & 0xff; - unsigned int mask_8x8_l = mask_8x8 & 0xff; - unsigned int mask_4x4_l = mask_4x4 & 0xff; - unsigned int mask_4x4_int_l = mask_4x4_int & 0xff; - -// Disable filtering on the leftmost column. + // Disable filtering on the leftmost column. #if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - highbd_filter_selectively_vert_row2( - plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, - mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfl_uv[r << 1], (int)cm->bit_depth); - } else { - filter_selectively_vert_row2( - plane->subsampling_x, dst->buf, dst->stride, - mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfl_uv[r << 1]); - } -#else - filter_selectively_vert_row2( - plane->subsampling_x, dst->buf, dst->stride, - mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info, - &lfl_uv[r << 1]); + if (cm->use_highbitdepth) { + highbd_filter_selectively_vert_row2(plane->subsampling_x, + CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, &lfl_uv[r << 1], + (int)cm->bit_depth); + } else { #endif // CONFIG_VP9_HIGHBITDEPTH - - dst->buf += 16 * dst->stride; - mask_16x16 >>= 8; - mask_8x8 >>= 8; - mask_4x4 >>= 8; - mask_4x4_int >>= 8; + filter_selectively_vert_row2(plane->subsampling_x, dst->buf, dst->stride, + (unsigned int)mask_16x16, + (unsigned int)mask_8x8, + (unsigned int)mask_4x4, + (unsigned int)mask_4x4_int, + cm->lf_info.lfthr, &lfl_uv[r << 1]); +#if CONFIG_VP9_HIGHBITDEPTH } +#endif // CONFIG_VP9_HIGHBITDEPTH + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; } // Horizontal pass @@ -1499,17 +1459,16 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, if (cm->use_highbitdepth) { highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int_r, &cm->lf_info, - &lfl_uv[r << 1], (int)cm->bit_depth); + mask_4x4_r, mask_4x4_int_r, + cm->lf_info.lfthr, &lfl_uv[r << 1], + (int)cm->bit_depth); } else { +#endif // CONFIG_VP9_HIGHBITDEPTH filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int_r, &cm->lf_info, + mask_4x4_r, mask_4x4_int_r, cm->lf_info.lfthr, &lfl_uv[r << 1]); +#if CONFIG_VP9_HIGHBITDEPTH } -#else - filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r, - mask_4x4_r, mask_4x4_int_r, &cm->lf_info, - &lfl_uv[r << 1]); #endif // CONFIG_VP9_HIGHBITDEPTH dst->buf += 8 * dst->stride; diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index c4d91c825..445785835 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -142,6 +142,7 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, // 129 C D .. W X // 129 E F .. U V // 129 G H .. S T T T T T + // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1. // Get current frame pointer, width and height. if (plane == 0) { @@ -177,7 +178,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, left_col[i] = ref[i * ref_stride - 1]; } } else { - // TODO(Peter): this value should probably change for high bitdepth vpx_memset16(left_col, base + 1, bs); } } @@ -239,7 +239,6 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, vpx_memset16(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width); } - // TODO(Peter) this value should probably change for high bitdepth above_row[-1] = left_available ? above_ref[-1] : (base + 1); } else { /* faster path if the block does not need extension */ @@ -251,13 +250,11 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd, memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0])); else vpx_memset16(above_row + bs, above_row[bs - 1], bs); - // TODO(Peter): this value should probably change for high bitdepth above_row[-1] = left_available ? above_ref[-1] : (base + 1); } } } else { vpx_memset16(above_row, base - 1, bs * 2); - // TODO(Peter): this value should probably change for high bitdepth above_row[-1] = base - 1; } } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index d7f5a2113..7b9869b52 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -245,7 +245,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_fht16x16 sse2/; add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; + specialize qw/vp9_fwht4x4/, "$sse2_x86inc"; } else { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht4x4 sse2 msa/; @@ -257,7 +257,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_fht16x16 sse2 msa/; add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc"; + specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc"; } # diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 6e21bb194..d63912932 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1339,22 +1339,23 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, // has valid dimensions. for (i = 0; i < REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; - has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width, - ref_frame->buf->y_crop_height, - width, height); + has_valid_ref_frame |= (ref_frame->idx != INVALID_IDX && + valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, + width, height)); } if (!has_valid_ref_frame) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); for (i = 0; i < REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; - if (!valid_ref_frame_img_fmt( - ref_frame->buf->bit_depth, - ref_frame->buf->subsampling_x, - ref_frame->buf->subsampling_y, - cm->bit_depth, - cm->subsampling_x, - cm->subsampling_y)) + if (ref_frame->idx == INVALID_IDX || + !valid_ref_frame_img_fmt(ref_frame->buf->bit_depth, + ref_frame->buf->subsampling_x, + ref_frame->buf->subsampling_y, + cm->bit_depth, + cm->subsampling_x, + cm->subsampling_y)) vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 147743e8d..bbdfbb823 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -145,6 +145,11 @@ struct macroblock { uint8_t sb_is_skin; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1 2 for 64x32, 3 4 for 32x64, 5~8 for + // 32x32. + uint8_t variance_low[9]; + void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 67069e7c1..e3570504e 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -747,6 +747,8 @@ static int choose_partitioning(VP9_COMP *cpi, const uint8_t *d; int sp; int dp; + // Ref frame used in partitioning. + MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; int pixels_wide = 64, pixels_high = 64; int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]}; @@ -771,6 +773,10 @@ static int choose_partitioning(VP9_COMP *cpi, } } + for (i = 0; i < 9; i++) { + x->variance_low[i] = 0; + } + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) @@ -831,8 +837,10 @@ static int choose_partitioning(VP9_COMP *cpi, mi->ref_frame[0] = GOLDEN_FRAME; mi->mv[0].as_int = 0; y_sad = y_sad_g; + ref_frame_partition = GOLDEN_FRAME; } else { x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + ref_frame_partition = LAST_FRAME; } set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); @@ -1073,6 +1081,34 @@ static int choose_partitioning(VP9_COMP *cpi, } } } + + if (cpi->sf.short_circuit_low_temp_var) { + // Set low variance flag, only for blocks >= 32x32 and if LAST_FRAME was + // selected. + if (ref_frame_partition == LAST_FRAME) { + if (xd->mi[0]->sb_type == BLOCK_64X64 && + vt.part_variances.none.variance < (thresholds[0] >> 1)) { + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_64X32) { + if (vt.part_variances.horz[0].variance < (thresholds[0] >> 2)) + x->variance_low[1] = 1; + if (vt.part_variances.horz[1].variance < (thresholds[0] >> 2)) + x->variance_low[2] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_32X64) { + if (vt.part_variances.vert[0].variance < (thresholds[0] >> 2)) + x->variance_low[3] = 1; + if (vt.part_variances.vert[1].variance < (thresholds[0] >> 2)) + x->variance_low[4] = 1; + } else { + // 32x32 + for (i = 0; i < 4; i++) { + if (!force_split[i + 1] && + vt.split[i].part_variances.none.variance < (thresholds[1] >> 1)) + x->variance_low[i + 5] = 1; + } + } + } + } return 0; } diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index f456f37a1..a70eaea3e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -236,13 +236,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->duration -= frame->duration; } -// Calculate the linear size relative to a baseline of 1080P -#define BASE_SIZE 2073600.0 // 1920x1080 -static double get_linear_size_factor(const VP9_COMP *cpi) { - const double this_area = cpi->initial_width * cpi->initial_height; - return pow(this_area / BASE_SIZE, 0.5); -} - // Calculate an active area of the image that discounts formatting // bars and partially discounts other 0 energy areas. #define MIN_ACTIVE_AREA 0.5 @@ -1247,14 +1240,15 @@ static double calc_correction_factor(double err_per_mb, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -#define ERR_DIVISOR 100.0 -static int get_twopass_worst_quality(const VP9_COMP *cpi, +#define ERR_DIVISOR 115.0 +static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, double inactive_zone, - int section_target_bandwidth, - double group_weight_factor) { + int section_target_bandwidth) { const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + // Clamp the target rate to VBR min / max limts. const int target_rate = vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth); @@ -1269,7 +1263,7 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); const double av_err_per_mb = section_err / active_mbs; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double ediv_size_correction; + double last_group_rate_err; const int target_norm_bits_per_mb = ((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs; int q; @@ -1278,29 +1272,27 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) is_svc_upper_layer = 1; - // Larger image formats are expected to be a little harder to code - // relatively given the same prediction error score. This in part at - // least relates to the increased size and hence coding overheads of - // motion vectors. Some account of this is made through adjustment of - // the error divisor. - ediv_size_correction = - VPXMAX(0.2, VPXMIN(5.0, get_linear_size_factor(cpi))); - if (ediv_size_correction < 1.0) - ediv_size_correction = -(1.0 / ediv_size_correction); - ediv_size_correction *= 4.0; + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = + VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = + VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); // Try and pick a max Q that will be high enough to encode the // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = calc_correction_factor(av_err_per_mb, - ERR_DIVISOR - ediv_size_correction, + ERR_DIVISOR, is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, FACTOR_PT_HIGH, q, cpi->common.bit_depth); const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q, - factor * speed_term * group_weight_factor, + factor * speed_term * cpi->twopass.bpm_factor, cpi->common.bit_depth); if (bits_per_mb <= target_norm_bits_per_mb) break; @@ -2115,8 +2107,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { old_boost_score = boost_score; } - twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); - // Was the group length constrained by the requirement for a new KF? rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; @@ -2184,24 +2174,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) / (rc->baseline_gf_interval * (double)cm->mb_rows)); - - int tmp_q; - // rc factor is a weight factor that corrects for local rate control drift. - double rc_factor = 1.0; - if (rc->rate_error_estimate > 0) { - rc_factor = VPXMAX(RC_FACTOR_MIN, - (double)(100 - rc->rate_error_estimate) / 100.0); - } else { - rc_factor = VPXMIN(RC_FACTOR_MAX, - (double)(100 - rc->rate_error_estimate) / 100.0); - } - tmp_q = - get_twopass_worst_quality(cpi, group_av_err, - (group_av_skip_pct + group_av_inactive_zone), - vbr_group_bits_per_frame, - twopass->kfgroup_inter_fraction * rc_factor); + int tmp_q = + get_twopass_worst_quality(cpi, group_av_err, + (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame); twopass->active_worst_quality = - VPXMAX(tmp_q, twopass->active_worst_quality >> 1); + (tmp_q + (twopass->active_worst_quality * 3)) >> 2; } #endif @@ -2243,6 +2221,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } + + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; } // Threshold for use of the lagging second reference frame. High second ref @@ -2580,16 +2562,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); - // Work out the fraction of the kf group bits reserved for the inter frames - // within the group after discounting the bits for the kf itself. - if (twopass->kf_group_bits) { - twopass->kfgroup_inter_fraction = - (double)(twopass->kf_group_bits - kf_bits) / - (double)twopass->kf_group_bits; - } else { - twopass->kfgroup_inter_fraction = 1.0; - } - twopass->kf_group_bits -= kf_bits; // Save the bits to spend on the key frame. @@ -2683,21 +2655,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; - int frames_left; FIRSTPASS_STATS this_frame; int target_rate; LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0; - if (lc != NULL) { - frames_left = (int)(twopass->total_stats.count - - lc->current_video_frame_in_layer); - } else { - frames_left = (int)(twopass->total_stats.count - - cm->current_video_frame); - } - if (!twopass->stats_in) return; @@ -2739,6 +2702,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0 || (lc != NULL && lc->current_video_frame_in_layer == 0)) { + const int frames_left = (int)(twopass->total_stats.count - + ((lc != NULL) ? lc->current_video_frame_in_layer + : cm->current_video_frame)); // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -2750,10 +2716,17 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { const double section_inactive_zone = (twopass->total_left_stats.inactive_zone_rows * 2) / ((double)cm->mb_rows * section_length); - const int tmp_q = - get_twopass_worst_quality(cpi, section_error, - section_intra_skip + section_inactive_zone, - section_target_bandwidth, DEFAULT_GRP_WEIGHT); + int tmp_q; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initiallize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; + + tmp_q = get_twopass_worst_quality(cpi, section_error, + section_intra_skip + section_inactive_zone, section_target_bandwidth); twopass->active_worst_quality = tmp_q; twopass->baseline_active_worst_quality = tmp_q; @@ -2871,6 +2844,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0); + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->this_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + // Calculate the pct rc error. if (rc->total_actual_bits) { rc->rate_error_estimate = @@ -2892,7 +2869,6 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { // If the rate control is drifting consider adjustment to min or maxq. if ((cpi->oxcf.rc_mode != VPX_Q) && - (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) && !cpi->rc.is_src_frame_alt_ref) { const int maxq_adj_limit = rc->worst_quality - twopass->active_worst_quality; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 7eb44fa13..76072884d 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -39,8 +39,6 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif -#define VLOW_MOTION_THRESHOLD 950 - typedef struct { double frame; double weight; @@ -124,14 +122,13 @@ typedef struct { // Error score of frames still to be coded in kf group int64_t kf_group_error_left; - // The fraction for a kf groups total bits allocated to the inter frames - double kfgroup_inter_fraction; + double bpm_factor; + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; int sr_update_lag; - int kf_zeromotion_pct; int last_kfgroup_zeromotion_pct; - int gf_zeromotion_pct; int active_worst_quality; int baseline_active_worst_quality; int extend_minq; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 918b3b10b..554409b74 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1126,34 +1126,38 @@ static INLINE void find_predictors(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, + int force_skip_low_temp_var) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); TileInfo *const tile_info = &tile_data->tile_info; -// TODO(jingning) placeholder for inter-frame non-RD mode decision. + // TODO(jingning) placeholder for inter-frame non-RD mode decision. x->pred_mv_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; -// this needs various further optimizations. to be continued.. + // this needs various further optimizations. to be continued.. if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) + if (cm->use_prev_frame_mvs) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); - else - const_motion[ref_frame] = - mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, - candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, - (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } else { + const_motion[ref_frame] = + mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, + candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, + (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame]); - if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) { + // Early exit for golden frame if force_skip_low_temp_var is set. + if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 && + !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) { vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, bsize); } @@ -1266,6 +1270,39 @@ static void recheck_zeromv_after_denoising( } #endif // CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int set_force_skip_low_temp_var(uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. + if (bsize == BLOCK_64X64) { + force_skip_low_temp_var = variance_low[0]; + } else if (bsize == BLOCK_64X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[1]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[2]; + } + } else if (bsize == BLOCK_32X64) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[3]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[4]; + } + } else if (bsize == BLOCK_32X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[5]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[6]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[7]; + } else if ((mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[8]; + } + } + return force_skip_low_temp_var; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, @@ -1323,6 +1360,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int ref_frame_cost[MAX_REF_FRAMES]; int svc_force_zero_mode[3] = {0}; int perform_intra_pred = 1; + int use_golden_nonzeromv = 1; + int force_skip_low_temp_var = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1409,10 +1448,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } + if (cpi->sf.short_circuit_low_temp_var) { + force_skip_low_temp_var = + set_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + } + + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) + use_golden_nonzeromv = 0; + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize); + yv12_mb, bsize, force_skip_low_temp_var); } for (idx = 0; idx < RT_INTER_MODES; ++idx) { @@ -1424,6 +1472,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int is_skippable; int this_early_term = 0; PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode; @@ -1442,17 +1491,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (const_motion[ref_frame] && this_mode == NEARMV) continue; + // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. + if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) { + continue; + } + if (cpi->use_svc) { if (svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (!(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { + if (!force_skip_low_temp_var && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) @@ -1543,7 +1602,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - if (this_mode == NEWMV && ref_frame == LAST_FRAME && + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no + // need to compute best_pred_sad which is only used to skip golden NEWMV. + if (use_golden_nonzeromv && this_mode == NEWMV && + ref_frame == LAST_FRAME && frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; const uint8_t * const pre_buf = xd->plane[0].pre[0].buf + @@ -1555,21 +1617,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (cpi->use_svc) { - if (this_mode == NEWMV && ref_frame == GOLDEN_FRAME && - frame_mv[NEWMV][GOLDEN_FRAME].as_int != INVALID_MV) { - const int pre_stride = xd->plane[0].pre[0].stride; - const uint8_t * const pre_buf = xd->plane[0].pre[0].buf + - (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.row >> 3) * pre_stride + - (frame_mv[NEWMV][GOLDEN_FRAME].as_mv.col >> 3); - best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, - x->plane[0].src.stride, - pre_buf, pre_stride); - x->pred_mv_sad[GOLDEN_FRAME] = best_pred_sad; - } - } - - if (this_mode != NEARESTMV && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) @@ -1795,11 +1842,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } // Perform intra prediction search, if the best SAD is above a certain - // threshold. - if (perform_intra_pred && - ((best_rdc.rdcost == INT64_MAX || - (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize)))) { + // threshold. Skip intra prediction if force_skip_low_temp_var is set. + if (!force_skip_low_temp_var && perform_intra_pred && + (best_rdc.rdcost == INT64_MAX || + (!x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize))) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 }; int i; TX_SIZE best_intra_tx_size = TX_SIZES; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index b8a5e6e7d..6c3f91951 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1160,8 +1160,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. - if ((cpi->oxcf.rc_mode != VPX_Q) && - (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { + if (cpi->oxcf.rc_mode != VPX_Q) { if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { @@ -1559,12 +1558,13 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { if (cm->current_video_frame > 30 && rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 && rc->avg_size_inter > (5 * rc->avg_frame_bandwidth) >> 1) { - rc->baseline_gf_interval = (3 * rc->baseline_gf_interval) >> 1; + rc->baseline_gf_interval = + VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); } else if (cm->current_video_frame > 30 && rc->avg_frame_low_motion < 20) { // Decrease boost and gf interval for high motion case. rc->gfu_boost = DEFAULT_GF_BOOST >> 1; - rc->baseline_gf_interval = VPXMIN(6, rc->baseline_gf_interval >> 1); + rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1); } adjust_gf_key_frame(cpi); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -1890,27 +1890,28 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { RATE_CONTROL *const rc = &cpi->rc; int64_t vbr_bits_off_target = rc->vbr_bits_off_target; int max_delta; - double position_factor = 1.0; - - // How far through the clip are we. - // This number is used to damp the per frame rate correction. - // Range 0 - 1.0 - if (cpi->twopass.total_stats.count) { - position_factor = sqrt((double)cpi->common.current_video_frame / - cpi->twopass.total_stats.count); - } - max_delta = (int)(position_factor * - ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); - - // vbr_bits_off_target > 0 means we have extra bits to spend - if (vbr_bits_off_target > 0) { - *this_frame_target += - (vbr_bits_off_target > max_delta) ? max_delta - : (int)vbr_bits_off_target; - } else { - *this_frame_target -= - (vbr_bits_off_target < -max_delta) ? max_delta - : (int)-vbr_bits_off_target; + int frame_window = VPXMIN(16, + ((int)cpi->twopass.total_stats.count - cpi->common.current_video_frame)); + + // Calcluate the adjustment to rate for this frame. + if (frame_window > 0) { + max_delta = (vbr_bits_off_target > 0) + ? (int)(vbr_bits_off_target / frame_window) + : (int)(-vbr_bits_off_target / frame_window); + + max_delta = VPXMIN(max_delta, + ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); + + // vbr_bits_off_target > 0 means we have extra bits to spend + if (vbr_bits_off_target > 0) { + *this_frame_target += + (vbr_bits_off_target > max_delta) ? max_delta + : (int)vbr_bits_off_target; + } else { + *this_frame_target -= + (vbr_bits_off_target < -max_delta) ? max_delta + : (int)-vbr_bits_off_target; + } } // Fast redistribution of bits arising from massive local undershoot. diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 02be3c3f9..0090b4f40 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -429,6 +429,11 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; + if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.pass == 0 && + content != VP9E_CONTENT_SCREEN) { + // Enable short circuit for low temporal variance. + sf->short_circuit_low_temp_var = 1; + } } if (speed >= 7) { @@ -554,6 +559,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->default_interp_filter = SWITCHABLE; sf->simple_model_rd_from_var = 0; sf->short_circuit_flat_blocks = 0; + sf->short_circuit_low_temp_var = 0; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 90b32164b..71ff0ac10 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -446,6 +446,10 @@ typedef struct SPEED_FEATURES { // Skip a number of expensive mode evaluations for blocks with zero source // variance. int short_circuit_flat_blocks; + + // Skip a number of expensive mode evaluations for blocks with very low + // temporal variance. + int short_circuit_low_temp_var; } SPEED_FEATURES; struct VP9_COMP; diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index fa37b6fed..fa37b6fed 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm deleted file mode 100644 index 7a7a6b655..000000000 --- a/vp9/encoder/x86/vp9_dct_mmx.asm +++ /dev/null @@ -1,104 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro TRANSFORM_COLS 0 - paddw m0, m1 - movq m4, m0 - psubw m3, m2 - psubw m4, m3 - psraw m4, 1 - movq m5, m4 - psubw m5, m1 ;b1 - psubw m4, m2 ;c1 - psubw m0, m4 - paddw m3, m5 - ; m0 a0 - SWAP 1, 4 ; m1 c1 - SWAP 2, 3 ; m2 d1 - SWAP 3, 5 ; m3 b1 -%endmacro - -%macro TRANSPOSE_4X4 0 - movq m4, m0 - movq m5, m2 - punpcklwd m4, m1 - punpckhwd m0, m1 - punpcklwd m5, m3 - punpckhwd m2, m3 - movq m1, m4 - movq m3, m0 - punpckldq m1, m5 - punpckhdq m4, m5 - punpckldq m3, m2 - punpckhdq m0, m2 - SWAP 2, 3, 0, 1, 4 -%endmacro - -INIT_MMX mmx -cglobal fwht4x4, 3, 4, 8, input, output, stride - lea r3q, [inputq + strideq*4] - movq m0, [inputq] ;a1 - movq m1, [inputq + strideq*2] ;b1 - movq m2, [r3q] ;c1 - movq m3, [r3q + strideq*2] ;d1 - - TRANSFORM_COLS - TRANSPOSE_4X4 - TRANSFORM_COLS - TRANSPOSE_4X4 - - psllw m0, 2 - psllw m1, 2 - psllw m2, 2 - psllw m3, 2 - -%if CONFIG_VP9_HIGHBITDEPTH - pxor m4, m4 - pxor m5, m5 - pcmpgtw m4, m0 - pcmpgtw m5, m1 - movq m6, m0 - movq m7, m1 - punpcklwd m0, m4 - punpcklwd m1, m5 - punpckhwd m6, m4 - punpckhwd m7, m5 - movq [outputq], m0 - movq [outputq + 8], m6 - movq [outputq + 16], m1 - movq [outputq + 24], m7 - pxor m4, m4 - pxor m5, m5 - pcmpgtw m4, m2 - pcmpgtw m5, m3 - movq m6, m2 - movq m7, m3 - punpcklwd m2, m4 - punpcklwd m3, m5 - punpckhwd m6, m4 - punpckhwd m7, m5 - movq [outputq + 32], m2 - movq [outputq + 40], m6 - movq [outputq + 48], m3 - movq [outputq + 56], m7 -%else - movq [outputq], m0 - movq [outputq + 8], m1 - movq [outputq + 16], m2 - movq [outputq + 24], m3 -%endif - - RET diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm new file mode 100644 index 000000000..d3b2a271b --- /dev/null +++ b/vp9/encoder/x86/vp9_dct_sse2.asm @@ -0,0 +1,87 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%define private_prefix vp9 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride +; TODO(linfeng): The duplication with vp10 should be resolved. + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + +%if CONFIG_VP9_HIGHBITDEPTH + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 +%else + mova [outputq], m0 + mova [outputq + 16], m1 +%endif + + RET diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 0bc417fc1..000000000 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#if defined(_MSC_VER) -# include <intrin.h> -#endif -#include <emmintrin.h> -#include <smmintrin.h> - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -# define LIKELY(v) __builtin_expect(v, 1) -# define UNLIKELY(v) __builtin_expect(v, 0) -#else -# define LIKELY(v) (v) -# define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, - const int *joint_cost, int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + - comp_cost[0][mv.as_mv.row] + comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = pack_int_mv(mv.as_mv.row - ref->row, - mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO((unsigned)mv_cost(diff, x->nmvjointsadcost, - x->nmvsadcost) * - sad_per_bit, VP9_PROB_COST_SHIFT); -} - -/***************************************************************************** - * This function utilises 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); - const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); - const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, - center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + - ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - unsigned int best_sad; - - int i; - int j; - int step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d; - __m128i v_cost_d; - __m128i v_outside_d; - __m128i v_inside_d; - __m128i v_diff_mv_w; -#if ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, - _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = _mm_and_si128(v_bo32_q, - _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, - (const uint8_t **)&v_blocka[0], in_what_stride, - (uint32_t*)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - - v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, - _mm_setzero_si128()); - const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, - v_joint_cost_0_d, - v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, 8) - v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80)); - v_cost_d = _mm_srai_epi32(v_cost_d, 8); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. - if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = _mm_set1_epi32(bmv.as_int); -#if ARCH_X86_64 - v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 2930c23dd..7643b48df 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -96,13 +96,12 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c endif ifeq ($(CONFIG_USE_X86INC),yes) -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm @@ -117,7 +116,7 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm endif endif -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index ff7533745..ad91eadbe 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -35,10 +35,10 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { c1 = e1 - c1; a1 -= b1; d1 += c1; - op[0] = WRAPLOW(a1, 8); - op[1] = WRAPLOW(b1, 8); - op[2] = WRAPLOW(c1, 8); - op[3] = WRAPLOW(d1, 8); + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); ip += 4; op += 4; } @@ -76,8 +76,8 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; - op[0] = WRAPLOW(a1, 8); - op[1] = op[2] = op[3] = WRAPLOW(e1, 8); + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); ip = tmp; for (i = 0; i < 4; i++) { @@ -98,18 +98,18 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step[3] = WRAPLOW(dct_const_round_shift(temp2), 8); + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); // stage 2 - output[0] = WRAPLOW(step[0] + step[3], 8); - output[1] = WRAPLOW(step[1] + step[2], 8); - output[2] = WRAPLOW(step[1] - step[2], 8); - output[3] = WRAPLOW(step[0] - step[3], 8); + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); } void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -141,8 +141,8 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -164,48 +164,48 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); // stage 2 temp1 = (step1[0] + step1[2]) * cospi_16_64; temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); step1[7] = step2[7]; // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7], 8); - output[1] = WRAPLOW(step1[1] + step1[6], 8); - output[2] = WRAPLOW(step1[2] + step1[5], 8); - output[3] = WRAPLOW(step1[3] + step1[4], 8); - output[4] = WRAPLOW(step1[3] - step1[4], 8); - output[5] = WRAPLOW(step1[2] - step1[5], 8); - output[6] = WRAPLOW(step1[1] - step1[6], 8); - output[7] = WRAPLOW(step1[0] - step1[7], 8); + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); } void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -236,8 +236,8 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) @@ -277,10 +277,10 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8); - output[2] = WRAPLOW(dct_const_round_shift(s2), 8); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8); + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } void iadst8_c(const tran_low_t *input, tran_low_t *output) { @@ -311,14 +311,14 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); - x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8); + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); // stage 2 s0 = (int)x0; @@ -330,14 +330,14 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); - x0 = WRAPLOW(s0 + s2, 8); - x1 = WRAPLOW(s1 + s3, 8); - x2 = WRAPLOW(s0 - s2, 8); - x3 = WRAPLOW(s1 - s3, 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); // stage 3 s2 = (int)(cospi_16_64 * (x2 + x3)); @@ -345,19 +345,19 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { s6 = (int)(cospi_16_64 * (x6 + x7)); s7 = (int)(cospi_16_64 * (x6 - x7)); - x2 = WRAPLOW(dct_const_round_shift(s2), 8); - x3 = WRAPLOW(dct_const_round_shift(s3), 8); - x6 = WRAPLOW(dct_const_round_shift(s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s7), 8); - - output[0] = WRAPLOW(x0, 8); - output[1] = WRAPLOW(-x4, 8); - output[2] = WRAPLOW(x6, 8); - output[3] = WRAPLOW(-x2, 8); - output[4] = WRAPLOW(x3, 8); - output[5] = WRAPLOW(-x7, 8); - output[6] = WRAPLOW(x5, 8); - output[7] = WRAPLOW(-x1, 8); + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); } void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { @@ -420,23 +420,23 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); // stage 3 step1[0] = step2[0]; @@ -446,109 +446,109 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step1[8] = WRAPLOW(step2[8] + step2[9], 8); - step1[9] = WRAPLOW(step2[8] - step2[9], 8); - step1[10] = WRAPLOW(-step2[10] + step2[11], 8); - step1[11] = WRAPLOW(step2[10] + step2[11], 8); - step1[12] = WRAPLOW(step2[12] + step2[13], 8); - step1[13] = WRAPLOW(step2[12] - step2[13], 8); - step1[14] = WRAPLOW(-step2[14] + step2[15], 8); - step1[15] = WRAPLOW(step2[14] + step2[15], 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); step2[11] = step1[11]; step2[12] = step1[12]; // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], 8); - step1[9] = WRAPLOW(step2[9] + step2[10], 8); - step1[10] = WRAPLOW(step2[9] - step2[10], 8); - step1[11] = WRAPLOW(step2[8] - step2[11], 8); - step1[12] = WRAPLOW(-step2[12] + step2[15], 8); - step1[13] = WRAPLOW(-step2[13] + step2[14], 8); - step1[14] = WRAPLOW(step2[13] + step2[14], 8); - step1[15] = WRAPLOW(step2[12] + step2[15], 8); + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], 8); - step2[1] = WRAPLOW(step1[1] + step1[6], 8); - step2[2] = WRAPLOW(step1[2] + step1[5], 8); - step2[3] = WRAPLOW(step1[3] + step1[4], 8); - step2[4] = WRAPLOW(step1[3] - step1[4], 8); - step2[5] = WRAPLOW(step1[2] - step1[5], 8); - step2[6] = WRAPLOW(step1[1] - step1[6], 8); - step2[7] = WRAPLOW(step1[0] - step1[7], 8); + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15], 8); - output[1] = WRAPLOW(step2[1] + step2[14], 8); - output[2] = WRAPLOW(step2[2] + step2[13], 8); - output[3] = WRAPLOW(step2[3] + step2[12], 8); - output[4] = WRAPLOW(step2[4] + step2[11], 8); - output[5] = WRAPLOW(step2[5] + step2[10], 8); - output[6] = WRAPLOW(step2[6] + step2[9], 8); - output[7] = WRAPLOW(step2[7] + step2[8], 8); - output[8] = WRAPLOW(step2[7] - step2[8], 8); - output[9] = WRAPLOW(step2[6] - step2[9], 8); - output[10] = WRAPLOW(step2[5] - step2[10], 8); - output[11] = WRAPLOW(step2[4] - step2[11], 8); - output[12] = WRAPLOW(step2[3] - step2[12], 8); - output[13] = WRAPLOW(step2[2] - step2[13], 8); - output[14] = WRAPLOW(step2[1] - step2[14], 8); - output[15] = WRAPLOW(step2[0] - step2[15], 8); + output[0] = WRAPLOW(step2[0] + step2[15]); + output[1] = WRAPLOW(step2[1] + step2[14]); + output[2] = WRAPLOW(step2[2] + step2[13]); + output[3] = WRAPLOW(step2[3] + step2[12]); + output[4] = WRAPLOW(step2[4] + step2[11]); + output[5] = WRAPLOW(step2[5] + step2[10]); + output[6] = WRAPLOW(step2[6] + step2[9]); + output[7] = WRAPLOW(step2[7] + step2[8]); + output[8] = WRAPLOW(step2[7] - step2[8]); + output[9] = WRAPLOW(step2[6] - step2[9]); + output[10] = WRAPLOW(step2[5] - step2[10]); + output[11] = WRAPLOW(step2[4] - step2[11]); + output[12] = WRAPLOW(step2[3] - step2[12]); + output[13] = WRAPLOW(step2[2] - step2[13]); + output[14] = WRAPLOW(step2[1] - step2[14]); + output[15] = WRAPLOW(step2[0] - step2[15]); } void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, @@ -625,22 +625,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8); + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); // stage 2 s0 = x0; @@ -660,22 +660,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - x0 = WRAPLOW(s0 + s4, 8); - x1 = WRAPLOW(s1 + s5, 8); - x2 = WRAPLOW(s2 + s6, 8); - x3 = WRAPLOW(s3 + s7, 8); - x4 = WRAPLOW(s0 - s4, 8); - x5 = WRAPLOW(s1 - s5, 8); - x6 = WRAPLOW(s2 - s6, 8); - x7 = WRAPLOW(s3 - s7, 8); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8); + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); // stage 3 s0 = x0; @@ -695,22 +695,22 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - x0 = WRAPLOW(check_range(s0 + s2), 8); - x1 = WRAPLOW(check_range(s1 + s3), 8); - x2 = WRAPLOW(check_range(s0 - s2), 8); - x3 = WRAPLOW(check_range(s1 - s3), 8); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8); - x8 = WRAPLOW(check_range(s8 + s10), 8); - x9 = WRAPLOW(check_range(s9 + s11), 8); - x10 = WRAPLOW(check_range(s8 - s10), 8); - x11 = WRAPLOW(check_range(s9 - s11), 8); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8); + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -722,31 +722,31 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(dct_const_round_shift(s2), 8); - x3 = WRAPLOW(dct_const_round_shift(s3), 8); - x6 = WRAPLOW(dct_const_round_shift(s6), 8); - x7 = WRAPLOW(dct_const_round_shift(s7), 8); - x10 = WRAPLOW(dct_const_round_shift(s10), 8); - x11 = WRAPLOW(dct_const_round_shift(s11), 8); - x14 = WRAPLOW(dct_const_round_shift(s14), 8); - x15 = WRAPLOW(dct_const_round_shift(s15), 8); - - output[0] = WRAPLOW(x0, 8); - output[1] = WRAPLOW(-x8, 8); - output[2] = WRAPLOW(x12, 8); - output[3] = WRAPLOW(-x4, 8); - output[4] = WRAPLOW(x6, 8); - output[5] = WRAPLOW(x14, 8); - output[6] = WRAPLOW(x10, 8); - output[7] = WRAPLOW(x2, 8); - output[8] = WRAPLOW(x3, 8); - output[9] = WRAPLOW(x11, 8); - output[10] = WRAPLOW(x15, 8); - output[11] = WRAPLOW(x7, 8); - output[12] = WRAPLOW(x5, 8); - output[13] = WRAPLOW(-x13, 8); - output[14] = WRAPLOW(x9, 8); - output[15] = WRAPLOW(-x1, 8); + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); } void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, @@ -779,8 +779,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) @@ -813,43 +813,43 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); // stage 2 step2[0] = step1[0]; @@ -863,40 +863,40 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step2[16] = WRAPLOW(step1[16] + step1[17], 8); - step2[17] = WRAPLOW(step1[16] - step1[17], 8); - step2[18] = WRAPLOW(-step1[18] + step1[19], 8); - step2[19] = WRAPLOW(step1[18] + step1[19], 8); - step2[20] = WRAPLOW(step1[20] + step1[21], 8); - step2[21] = WRAPLOW(step1[20] - step1[21], 8); - step2[22] = WRAPLOW(-step1[22] + step1[23], 8); - step2[23] = WRAPLOW(step1[22] + step1[23], 8); - step2[24] = WRAPLOW(step1[24] + step1[25], 8); - step2[25] = WRAPLOW(step1[24] - step1[25], 8); - step2[26] = WRAPLOW(-step1[26] + step1[27], 8); - step2[27] = WRAPLOW(step1[26] + step1[27], 8); - step2[28] = WRAPLOW(step1[28] + step1[29], 8); - step2[29] = WRAPLOW(step1[28] - step1[29], 8); - step2[30] = WRAPLOW(-step1[30] + step1[31], 8); - step2[31] = WRAPLOW(step1[30] + step1[31], 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); // stage 3 step1[0] = step2[0]; @@ -906,42 +906,42 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); - - step1[8] = WRAPLOW(step2[8] + step2[9], 8); - step1[9] = WRAPLOW(step2[8] - step2[9], 8); - step1[10] = WRAPLOW(-step2[10] + step2[11], 8); - step1[11] = WRAPLOW(step2[10] + step2[11], 8); - step1[12] = WRAPLOW(step2[12] + step2[13], 8); - step1[13] = WRAPLOW(step2[12] - step2[13], 8); - step1[14] = WRAPLOW(-step2[14] + step2[15], 8); - step1[15] = WRAPLOW(step2[14] + step2[15], 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); step1[16] = step2[16]; step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -950,87 +950,87 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8); - step2[4] = WRAPLOW(step1[4] + step1[5], 8); - step2[5] = WRAPLOW(step1[4] - step1[5], 8); - step2[6] = WRAPLOW(-step1[6] + step1[7], 8); - step2[7] = WRAPLOW(step1[6] + step1[7], 8); + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); step2[11] = step1[11]; step2[12] = step1[12]; - step2[16] = WRAPLOW(step1[16] + step1[19], 8); - step2[17] = WRAPLOW(step1[17] + step1[18], 8); - step2[18] = WRAPLOW(step1[17] - step1[18], 8); - step2[19] = WRAPLOW(step1[16] - step1[19], 8); - step2[20] = WRAPLOW(-step1[20] + step1[23], 8); - step2[21] = WRAPLOW(-step1[21] + step1[22], 8); - step2[22] = WRAPLOW(step1[21] + step1[22], 8); - step2[23] = WRAPLOW(step1[20] + step1[23], 8); - - step2[24] = WRAPLOW(step1[24] + step1[27], 8); - step2[25] = WRAPLOW(step1[25] + step1[26], 8); - step2[26] = WRAPLOW(step1[25] - step1[26], 8); - step2[27] = WRAPLOW(step1[24] - step1[27], 8); - step2[28] = WRAPLOW(-step1[28] + step1[31], 8); - step2[29] = WRAPLOW(-step1[29] + step1[30], 8); - step2[30] = WRAPLOW(step1[29] + step1[30], 8); - step2[31] = WRAPLOW(step1[28] + step1[31], 8); + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], 8); - step1[1] = WRAPLOW(step2[1] + step2[2], 8); - step1[2] = WRAPLOW(step2[1] - step2[2], 8); - step1[3] = WRAPLOW(step2[0] - step2[3], 8); + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], 8); - step1[9] = WRAPLOW(step2[9] + step2[10], 8); - step1[10] = WRAPLOW(step2[9] - step2[10], 8); - step1[11] = WRAPLOW(step2[8] - step2[11], 8); - step1[12] = WRAPLOW(-step2[12] + step2[15], 8); - step1[13] = WRAPLOW(-step2[13] + step2[14], 8); - step1[14] = WRAPLOW(step2[13] + step2[14], 8); - step1[15] = WRAPLOW(step2[12] + step2[15], 8); + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); step1[16] = step2[16]; step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -1039,62 +1039,62 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { step1[31] = step2[31]; // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], 8); - step2[1] = WRAPLOW(step1[1] + step1[6], 8); - step2[2] = WRAPLOW(step1[2] + step1[5], 8); - step2[3] = WRAPLOW(step1[3] + step1[4], 8); - step2[4] = WRAPLOW(step1[3] - step1[4], 8); - step2[5] = WRAPLOW(step1[2] - step1[5], 8); - step2[6] = WRAPLOW(step1[1] - step1[6], 8); - step2[7] = WRAPLOW(step1[0] - step1[7], 8); + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8); + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); step2[14] = step1[14]; step2[15] = step1[15]; - step2[16] = WRAPLOW(step1[16] + step1[23], 8); - step2[17] = WRAPLOW(step1[17] + step1[22], 8); - step2[18] = WRAPLOW(step1[18] + step1[21], 8); - step2[19] = WRAPLOW(step1[19] + step1[20], 8); - step2[20] = WRAPLOW(step1[19] - step1[20], 8); - step2[21] = WRAPLOW(step1[18] - step1[21], 8); - step2[22] = WRAPLOW(step1[17] - step1[22], 8); - step2[23] = WRAPLOW(step1[16] - step1[23], 8); - - step2[24] = WRAPLOW(-step1[24] + step1[31], 8); - step2[25] = WRAPLOW(-step1[25] + step1[30], 8); - step2[26] = WRAPLOW(-step1[26] + step1[29], 8); - step2[27] = WRAPLOW(-step1[27] + step1[28], 8); - step2[28] = WRAPLOW(step1[27] + step1[28], 8); - step2[29] = WRAPLOW(step1[26] + step1[29], 8); - step2[30] = WRAPLOW(step1[25] + step1[30], 8); - step2[31] = WRAPLOW(step1[24] + step1[31], 8); + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15], 8); - step1[1] = WRAPLOW(step2[1] + step2[14], 8); - step1[2] = WRAPLOW(step2[2] + step2[13], 8); - step1[3] = WRAPLOW(step2[3] + step2[12], 8); - step1[4] = WRAPLOW(step2[4] + step2[11], 8); - step1[5] = WRAPLOW(step2[5] + step2[10], 8); - step1[6] = WRAPLOW(step2[6] + step2[9], 8); - step1[7] = WRAPLOW(step2[7] + step2[8], 8); - step1[8] = WRAPLOW(step2[7] - step2[8], 8); - step1[9] = WRAPLOW(step2[6] - step2[9], 8); - step1[10] = WRAPLOW(step2[5] - step2[10], 8); - step1[11] = WRAPLOW(step2[4] - step2[11], 8); - step1[12] = WRAPLOW(step2[3] - step2[12], 8); - step1[13] = WRAPLOW(step2[2] - step2[13], 8); - step1[14] = WRAPLOW(step2[1] - step2[14], 8); - step1[15] = WRAPLOW(step2[0] - step2[15], 8); + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); step1[16] = step2[16]; step1[17] = step2[17]; @@ -1102,58 +1102,58 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8); + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; // final stage - output[0] = WRAPLOW(step1[0] + step1[31], 8); - output[1] = WRAPLOW(step1[1] + step1[30], 8); - output[2] = WRAPLOW(step1[2] + step1[29], 8); - output[3] = WRAPLOW(step1[3] + step1[28], 8); - output[4] = WRAPLOW(step1[4] + step1[27], 8); - output[5] = WRAPLOW(step1[5] + step1[26], 8); - output[6] = WRAPLOW(step1[6] + step1[25], 8); - output[7] = WRAPLOW(step1[7] + step1[24], 8); - output[8] = WRAPLOW(step1[8] + step1[23], 8); - output[9] = WRAPLOW(step1[9] + step1[22], 8); - output[10] = WRAPLOW(step1[10] + step1[21], 8); - output[11] = WRAPLOW(step1[11] + step1[20], 8); - output[12] = WRAPLOW(step1[12] + step1[19], 8); - output[13] = WRAPLOW(step1[13] + step1[18], 8); - output[14] = WRAPLOW(step1[14] + step1[17], 8); - output[15] = WRAPLOW(step1[15] + step1[16], 8); - output[16] = WRAPLOW(step1[15] - step1[16], 8); - output[17] = WRAPLOW(step1[14] - step1[17], 8); - output[18] = WRAPLOW(step1[13] - step1[18], 8); - output[19] = WRAPLOW(step1[12] - step1[19], 8); - output[20] = WRAPLOW(step1[11] - step1[20], 8); - output[21] = WRAPLOW(step1[10] - step1[21], 8); - output[22] = WRAPLOW(step1[9] - step1[22], 8); - output[23] = WRAPLOW(step1[8] - step1[23], 8); - output[24] = WRAPLOW(step1[7] - step1[24], 8); - output[25] = WRAPLOW(step1[6] - step1[25], 8); - output[26] = WRAPLOW(step1[5] - step1[26], 8); - output[27] = WRAPLOW(step1[4] - step1[27], 8); - output[28] = WRAPLOW(step1[3] - step1[28], 8); - output[29] = WRAPLOW(step1[2] - step1[29], 8); - output[30] = WRAPLOW(step1[1] - step1[30], 8); - output[31] = WRAPLOW(step1[0] - step1[31], 8); + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); } void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, @@ -1253,8 +1253,8 @@ void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8); + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { @@ -1288,10 +1288,10 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, c1 = e1 - c1; a1 -= b1; d1 += c1; - op[0] = WRAPLOW(a1, bd); - op[1] = WRAPLOW(b1, bd); - op[2] = WRAPLOW(c1, bd); - op[3] = WRAPLOW(d1, bd); + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = HIGHBD_WRAPLOW(b1, bd); + op[2] = HIGHBD_WRAPLOW(c1, bd); + op[3] = HIGHBD_WRAPLOW(d1, bd); ip += 4; op += 4; } @@ -1332,8 +1332,8 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, a1 = ip[0] >> UNIT_QUANT_SHIFT; e1 = a1 >> 1; a1 -= e1; - op[0] = WRAPLOW(a1, bd); - op[1] = op[2] = op[3] = WRAPLOW(e1, bd); + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); ip = tmp; for (i = 0; i < 4; i++) { @@ -1359,18 +1359,18 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 2 - output[0] = WRAPLOW(step[0] + step[3], bd); - output[1] = WRAPLOW(step[1] + step[2], bd); - output[2] = WRAPLOW(step[1] - step[2], bd); - output[3] = WRAPLOW(step[0] - step[3], bd); + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); } void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1404,11 +1404,11 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1430,39 +1430,39 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half vpx_highbd_idct4_c(step1, step1, bd); // stage 2 - odd half - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); // stage 3 - odd half step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7], bd); - output[1] = WRAPLOW(step1[1] + step1[6], bd); - output[2] = WRAPLOW(step1[2] + step1[5], bd); - output[3] = WRAPLOW(step1[3] + step1[4], bd); - output[4] = WRAPLOW(step1[3] - step1[4], bd); - output[5] = WRAPLOW(step1[2] - step1[5], bd); - output[6] = WRAPLOW(step1[1] - step1[6], bd); - output[7] = WRAPLOW(step1[0] - step1[7], bd); + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); } void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1496,10 +1496,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) @@ -1540,10 +1540,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); - output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); - output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); + output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); } void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1574,14 +1574,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); - x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); + x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); // stage 2 s0 = x0; @@ -1593,14 +1593,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - x0 = WRAPLOW(s0 + s2, bd); - x1 = WRAPLOW(s1 + s3, bd); - x2 = WRAPLOW(s0 - s2, bd); - x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1608,19 +1608,19 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); - - output[0] = WRAPLOW(x0, bd); - output[1] = WRAPLOW(-x4, bd); - output[2] = WRAPLOW(x6, bd); - output[3] = WRAPLOW(-x2, bd); - output[4] = WRAPLOW(x3, bd); - output[5] = WRAPLOW(-x7, bd); - output[6] = WRAPLOW(x5, bd); - output[7] = WRAPLOW(-x1, bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); } void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1685,23 +1685,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1711,109 +1711,109 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step1[8] = WRAPLOW(step2[8] + step2[9], bd); - step1[9] = WRAPLOW(step2[8] - step2[9], bd); - step1[10] = WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = WRAPLOW(step2[10] + step2[11], bd); - step1[12] = WRAPLOW(step2[12] + step2[13], bd); - step1[13] = WRAPLOW(step2[12] - step2[13], bd); - step1[14] = WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = WRAPLOW(step2[14] + step2[15], bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); + step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], bd); - step1[1] = WRAPLOW(step2[1] + step2[2], bd); - step1[2] = WRAPLOW(step2[1] - step2[2], bd); - step1[3] = WRAPLOW(step2[0] - step2[3], bd); + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], bd); - step1[9] = WRAPLOW(step2[9] + step2[10], bd); - step1[10] = WRAPLOW(step2[9] - step2[10], bd); - step1[11] = WRAPLOW(step2[8] - step2[11], bd); - step1[12] = WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = WRAPLOW(step2[13] + step2[14], bd); - step1[15] = WRAPLOW(step2[12] + step2[15], bd); + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], bd); - step2[1] = WRAPLOW(step1[1] + step1[6], bd); - step2[2] = WRAPLOW(step1[2] + step1[5], bd); - step2[3] = WRAPLOW(step1[3] + step1[4], bd); - step2[4] = WRAPLOW(step1[3] - step1[4], bd); - step2[5] = WRAPLOW(step1[2] - step1[5], bd); - step2[6] = WRAPLOW(step1[1] - step1[6], bd); - step2[7] = WRAPLOW(step1[0] - step1[7], bd); + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15], bd); - output[1] = WRAPLOW(step2[1] + step2[14], bd); - output[2] = WRAPLOW(step2[2] + step2[13], bd); - output[3] = WRAPLOW(step2[3] + step2[12], bd); - output[4] = WRAPLOW(step2[4] + step2[11], bd); - output[5] = WRAPLOW(step2[5] + step2[10], bd); - output[6] = WRAPLOW(step2[6] + step2[9], bd); - output[7] = WRAPLOW(step2[7] + step2[8], bd); - output[8] = WRAPLOW(step2[7] - step2[8], bd); - output[9] = WRAPLOW(step2[6] - step2[9], bd); - output[10] = WRAPLOW(step2[5] - step2[10], bd); - output[11] = WRAPLOW(step2[4] - step2[11], bd); - output[12] = WRAPLOW(step2[3] - step2[12], bd); - output[13] = WRAPLOW(step2[2] - step2[13], bd); - output[14] = WRAPLOW(step2[1] - step2[14], bd); - output[15] = WRAPLOW(step2[0] - step2[15], bd); + output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1889,22 +1889,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); - x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); - x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); - x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); + x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; @@ -1924,22 +1924,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - x0 = WRAPLOW(s0 + s4, bd); - x1 = WRAPLOW(s1 + s5, bd); - x2 = WRAPLOW(s2 + s6, bd); - x3 = WRAPLOW(s3 + s7, bd); - x4 = WRAPLOW(s0 - s4, bd); - x5 = WRAPLOW(s1 - s5, bd); - x6 = WRAPLOW(s2 - s6, bd); - x7 = WRAPLOW(s3 - s7, bd); - x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); - x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); // stage 3 s0 = x0; @@ -1959,22 +1959,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - x0 = WRAPLOW(s0 + s2, bd); - x1 = WRAPLOW(s1 + s3, bd); - x2 = WRAPLOW(s0 - s2, bd); - x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); - x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); - x8 = WRAPLOW(s8 + s10, bd); - x9 = WRAPLOW(s9 + s11, bd); - x10 = WRAPLOW(s8 - s10, bd); - x11 = WRAPLOW(s9 - s11, bd); - x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); - x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -1986,31 +1986,31 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); - x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); - x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); - x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); - x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); - x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); - x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); - x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); - - output[0] = WRAPLOW(x0, bd); - output[1] = WRAPLOW(-x8, bd); - output[2] = WRAPLOW(x12, bd); - output[3] = WRAPLOW(-x4, bd); - output[4] = WRAPLOW(x6, bd); - output[5] = WRAPLOW(x14, bd); - output[6] = WRAPLOW(x10, bd); - output[7] = WRAPLOW(x2, bd); - output[8] = WRAPLOW(x3, bd); - output[9] = WRAPLOW(x11, bd); - output[10] = WRAPLOW(x15, bd); - output[11] = WRAPLOW(x7, bd); - output[12] = WRAPLOW(x5, bd); - output[13] = WRAPLOW(-x13, bd); - output[14] = WRAPLOW(x9, bd); - output[15] = WRAPLOW(-x1, bd); + x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); } void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2045,11 +2045,11 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) @@ -2084,43 +2084,43 @@ static void highbd_idct32_c(const tran_low_t *input, temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2134,40 +2134,40 @@ static void highbd_idct32_c(const tran_low_t *input, temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step2[16] = WRAPLOW(step1[16] + step1[17], bd); - step2[17] = WRAPLOW(step1[16] - step1[17], bd); - step2[18] = WRAPLOW(-step1[18] + step1[19], bd); - step2[19] = WRAPLOW(step1[18] + step1[19], bd); - step2[20] = WRAPLOW(step1[20] + step1[21], bd); - step2[21] = WRAPLOW(step1[20] - step1[21], bd); - step2[22] = WRAPLOW(-step1[22] + step1[23], bd); - step2[23] = WRAPLOW(step1[22] + step1[23], bd); - step2[24] = WRAPLOW(step1[24] + step1[25], bd); - step2[25] = WRAPLOW(step1[24] - step1[25], bd); - step2[26] = WRAPLOW(-step1[26] + step1[27], bd); - step2[27] = WRAPLOW(step1[26] + step1[27], bd); - step2[28] = WRAPLOW(step1[28] + step1[29], bd); - step2[29] = WRAPLOW(step1[28] - step1[29], bd); - step2[30] = WRAPLOW(-step1[30] + step1[31], bd); - step2[31] = WRAPLOW(step1[30] + step1[31], bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); + step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); + step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); + step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); + step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); + step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); // stage 3 step1[0] = step2[0]; @@ -2177,42 +2177,42 @@ static void highbd_idct32_c(const tran_low_t *input, temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - - step1[8] = WRAPLOW(step2[8] + step2[9], bd); - step1[9] = WRAPLOW(step2[8] - step2[9], bd); - step1[10] = WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = WRAPLOW(step2[10] + step2[11], bd); - step1[12] = WRAPLOW(step2[12] + step2[13], bd); - step1[13] = WRAPLOW(step2[12] - step2[13], bd); - step1[14] = WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = WRAPLOW(step2[14] + step2[15], bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); step1[16] = step2[16]; step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2221,87 +2221,87 @@ static void highbd_idct32_c(const tran_low_t *input, // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); - step2[4] = WRAPLOW(step1[4] + step1[5], bd); - step2[5] = WRAPLOW(step1[4] - step1[5], bd); - step2[6] = WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = WRAPLOW(step1[6] + step1[7], bd); + step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); step2[8] = step1[8]; step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; - step2[16] = WRAPLOW(step1[16] + step1[19], bd); - step2[17] = WRAPLOW(step1[17] + step1[18], bd); - step2[18] = WRAPLOW(step1[17] - step1[18], bd); - step2[19] = WRAPLOW(step1[16] - step1[19], bd); - step2[20] = WRAPLOW(-step1[20] + step1[23], bd); - step2[21] = WRAPLOW(-step1[21] + step1[22], bd); - step2[22] = WRAPLOW(step1[21] + step1[22], bd); - step2[23] = WRAPLOW(step1[20] + step1[23], bd); - - step2[24] = WRAPLOW(step1[24] + step1[27], bd); - step2[25] = WRAPLOW(step1[25] + step1[26], bd); - step2[26] = WRAPLOW(step1[25] - step1[26], bd); - step2[27] = WRAPLOW(step1[24] - step1[27], bd); - step2[28] = WRAPLOW(-step1[28] + step1[31], bd); - step2[29] = WRAPLOW(-step1[29] + step1[30], bd); - step2[30] = WRAPLOW(step1[29] + step1[30], bd); - step2[31] = WRAPLOW(step1[28] + step1[31], bd); + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); + step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); + step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); + step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); + step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); + step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3], bd); - step1[1] = WRAPLOW(step2[1] + step2[2], bd); - step1[2] = WRAPLOW(step2[1] - step2[2], bd); - step1[3] = WRAPLOW(step2[0] - step2[3], bd); + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[7] = step2[7]; - step1[8] = WRAPLOW(step2[8] + step2[11], bd); - step1[9] = WRAPLOW(step2[9] + step2[10], bd); - step1[10] = WRAPLOW(step2[9] - step2[10], bd); - step1[11] = WRAPLOW(step2[8] - step2[11], bd); - step1[12] = WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = WRAPLOW(step2[13] + step2[14], bd); - step1[15] = WRAPLOW(step2[12] + step2[15], bd); + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); step1[16] = step2[16]; step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2310,62 +2310,62 @@ static void highbd_idct32_c(const tran_low_t *input, step1[31] = step2[31]; // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7], bd); - step2[1] = WRAPLOW(step1[1] + step1[6], bd); - step2[2] = WRAPLOW(step1[2] + step1[5], bd); - step2[3] = WRAPLOW(step1[3] + step1[4], bd); - step2[4] = WRAPLOW(step1[3] - step1[4], bd); - step2[5] = WRAPLOW(step1[2] - step1[5], bd); - step2[6] = WRAPLOW(step1[1] - step1[6], bd); - step2[7] = WRAPLOW(step1[0] - step1[7], bd); + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; - step2[16] = WRAPLOW(step1[16] + step1[23], bd); - step2[17] = WRAPLOW(step1[17] + step1[22], bd); - step2[18] = WRAPLOW(step1[18] + step1[21], bd); - step2[19] = WRAPLOW(step1[19] + step1[20], bd); - step2[20] = WRAPLOW(step1[19] - step1[20], bd); - step2[21] = WRAPLOW(step1[18] - step1[21], bd); - step2[22] = WRAPLOW(step1[17] - step1[22], bd); - step2[23] = WRAPLOW(step1[16] - step1[23], bd); - - step2[24] = WRAPLOW(-step1[24] + step1[31], bd); - step2[25] = WRAPLOW(-step1[25] + step1[30], bd); - step2[26] = WRAPLOW(-step1[26] + step1[29], bd); - step2[27] = WRAPLOW(-step1[27] + step1[28], bd); - step2[28] = WRAPLOW(step1[27] + step1[28], bd); - step2[29] = WRAPLOW(step1[26] + step1[29], bd); - step2[30] = WRAPLOW(step1[25] + step1[30], bd); - step2[31] = WRAPLOW(step1[24] + step1[31], bd); + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); + step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); + step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); + step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); + step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); + step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15], bd); - step1[1] = WRAPLOW(step2[1] + step2[14], bd); - step1[2] = WRAPLOW(step2[2] + step2[13], bd); - step1[3] = WRAPLOW(step2[3] + step2[12], bd); - step1[4] = WRAPLOW(step2[4] + step2[11], bd); - step1[5] = WRAPLOW(step2[5] + step2[10], bd); - step1[6] = WRAPLOW(step2[6] + step2[9], bd); - step1[7] = WRAPLOW(step2[7] + step2[8], bd); - step1[8] = WRAPLOW(step2[7] - step2[8], bd); - step1[9] = WRAPLOW(step2[6] - step2[9], bd); - step1[10] = WRAPLOW(step2[5] - step2[10], bd); - step1[11] = WRAPLOW(step2[4] - step2[11], bd); - step1[12] = WRAPLOW(step2[3] - step2[12], bd); - step1[13] = WRAPLOW(step2[2] - step2[13], bd); - step1[14] = WRAPLOW(step2[1] - step2[14], bd); - step1[15] = WRAPLOW(step2[0] - step2[15], bd); + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); step1[16] = step2[16]; step1[17] = step2[17]; @@ -2373,58 +2373,58 @@ static void highbd_idct32_c(const tran_low_t *input, step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); - step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); + step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; step1[31] = step2[31]; // final stage - output[0] = WRAPLOW(step1[0] + step1[31], bd); - output[1] = WRAPLOW(step1[1] + step1[30], bd); - output[2] = WRAPLOW(step1[2] + step1[29], bd); - output[3] = WRAPLOW(step1[3] + step1[28], bd); - output[4] = WRAPLOW(step1[4] + step1[27], bd); - output[5] = WRAPLOW(step1[5] + step1[26], bd); - output[6] = WRAPLOW(step1[6] + step1[25], bd); - output[7] = WRAPLOW(step1[7] + step1[24], bd); - output[8] = WRAPLOW(step1[8] + step1[23], bd); - output[9] = WRAPLOW(step1[9] + step1[22], bd); - output[10] = WRAPLOW(step1[10] + step1[21], bd); - output[11] = WRAPLOW(step1[11] + step1[20], bd); - output[12] = WRAPLOW(step1[12] + step1[19], bd); - output[13] = WRAPLOW(step1[13] + step1[18], bd); - output[14] = WRAPLOW(step1[14] + step1[17], bd); - output[15] = WRAPLOW(step1[15] + step1[16], bd); - output[16] = WRAPLOW(step1[15] - step1[16], bd); - output[17] = WRAPLOW(step1[14] - step1[17], bd); - output[18] = WRAPLOW(step1[13] - step1[18], bd); - output[19] = WRAPLOW(step1[12] - step1[19], bd); - output[20] = WRAPLOW(step1[11] - step1[20], bd); - output[21] = WRAPLOW(step1[10] - step1[21], bd); - output[22] = WRAPLOW(step1[9] - step1[22], bd); - output[23] = WRAPLOW(step1[8] - step1[23], bd); - output[24] = WRAPLOW(step1[7] - step1[24], bd); - output[25] = WRAPLOW(step1[6] - step1[25], bd); - output[26] = WRAPLOW(step1[5] - step1[26], bd); - output[27] = WRAPLOW(step1[4] - step1[27], bd); - output[28] = WRAPLOW(step1[3] - step1[28], bd); - output[29] = WRAPLOW(step1[2] - step1[29], bd); - output[30] = WRAPLOW(step1[1] - step1[30], bd); - output[31] = WRAPLOW(step1[0] - step1[31], bd); + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); + output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); + output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); + output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); + output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); + output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); + output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); + output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); + output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); + output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); + output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); + output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); + output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); + output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); + output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); + output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); + output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); + output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); + output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); + output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); + output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); + output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); + output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); + output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); + output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); + output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); + output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); + output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); + output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, @@ -2500,9 +2500,9 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = WRAPLOW( - highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); - out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); + tran_low_t out = HIGHBD_WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h index 23588139e..c407dd896 100644 --- a/vpx_dsp/inv_txfm.h +++ b/vpx_dsp/inv_txfm.h @@ -21,7 +21,7 @@ extern "C" { #endif -static INLINE tran_low_t check_range(tran_high_t input) { +static INLINE tran_high_t check_range(tran_high_t input) { #if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid VP9 input streams, intermediate stage coefficients should always // stay within the range of a signed 16 bit integer. Coefficients can go out @@ -32,17 +32,17 @@ static INLINE tran_low_t check_range(tran_high_t input) { assert(INT16_MIN <= input); assert(input <= INT16_MAX); #endif // CONFIG_COEFFICIENT_RANGE_CHECKING - return (tran_low_t)input; + return input; } -static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { +static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return check_range(rv); + return (tran_high_t)rv; } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE tran_low_t highbd_check_range(tran_high_t input, - int bd) { +static INLINE tran_high_t highbd_check_range(tran_high_t input, + int bd) { #if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid highbitdepth VP9 streams, intermediate stage coefficients will // stay within the ranges: @@ -56,13 +56,12 @@ static INLINE tran_low_t highbd_check_range(tran_high_t input, (void) int_min; #endif // CONFIG_COEFFICIENT_RANGE_CHECKING (void) bd; - return (tran_low_t)input; + return input; } -static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, - int bd) { +static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return highbd_check_range(rv, bd); + return (tran_high_t)rv; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -83,9 +82,20 @@ static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits -#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd)) -#else -#define WRAPLOW(x, bd) ((int32_t)(x)) + +#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#else // CONFIG_EMULATE_HARDWARE + +#define WRAPLOW(x) ((int32_t)check_range(x)) +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) \ + ((int32_t)highbd_check_range((x), bd)) +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_EMULATE_HARDWARE void idct4_c(const tran_low_t *input, tran_low_t *output); @@ -107,14 +117,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, int bd) { - trans = WRAPLOW(trans, bd); - return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd); + trans = HIGHBD_WRAPLOW(trans, bd); + return clip_pixel_highbd(dest + trans, bd); } #endif static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { - trans = WRAPLOW(trans, 8); - return clip_pixel(WRAPLOW(dest + trans, 8)); + trans = WRAPLOW(trans); + return clip_pixel(dest + trans); } #ifdef __cplusplus } // extern "C" diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f812d4499..0e2b1a850 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -39,7 +39,6 @@ endif DSP_SRCS-yes += intrapred.c ifeq ($(CONFIG_USE_X86INC),yes) -DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm @@ -56,7 +55,6 @@ endif # CONFIG_VP9_HIGHBITDEPTH ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) DSP_SRCS-yes += add_noise.c DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c -DSP_SRCS-$(HAVE_MMX) += x86/add_noise_mmx.asm DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm endif # CONFIG_POSTPROC @@ -323,8 +321,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c -DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c -DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d5561173b..414428127 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -55,13 +55,13 @@ if ($opts{arch} eq "x86_64") { # add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_4x4/, "$ssse3_x86inc"; +specialize qw/vpx_d207_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207e_predictor_4x4/; add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_4x4 neon/, "$ssse3_x86inc"; +specialize qw/vpx_d45_predictor_4x4 neon/, "$sse2_x86inc"; add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45e_predictor_4x4/; @@ -118,7 +118,7 @@ add_proto qw/void vpx_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, c specialize qw/vpx_d207e_predictor_8x8/; add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_8x8 neon/, "$ssse3_x86inc"; +specialize qw/vpx_d45_predictor_8x8 neon/, "$sse2_x86inc"; add_proto qw/void vpx_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45e_predictor_8x8/; @@ -543,7 +543,7 @@ specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon; add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc"; +specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/; @@ -564,7 +564,7 @@ specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon; add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc"; +specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/; @@ -1478,25 +1478,25 @@ add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -1532,22 +1532,22 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, i specialize qw/vpx_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_avg_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc"; + specialize qw/vpx_sub_pixel_avg_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; # # Specialty Subpixel # add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_variance_halfpixvar16x16_h mmx sse2 media/; + specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/; add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_variance_halfpixvar16x16_v mmx sse2 media/; + specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/; add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_variance_halfpixvar16x16_hv mmx sse2 media/; + specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; @@ -1913,7 +1913,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; - specialize qw/vpx_plane_add_noise mmx sse2 msa/; + specialize qw/vpx_plane_add_noise sse2 msa/; } } # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/x86/add_noise_mmx.asm b/vpx_dsp/x86/add_noise_mmx.asm deleted file mode 100644 index 8c2623db4..000000000 --- a/vpx_dsp/x86/add_noise_mmx.asm +++ /dev/null @@ -1,86 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -;void vpx_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, -; unsigned char blackclamp[16], -; unsigned char whiteclamp[16], -; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) -global sym(vpx_plane_add_noise_mmx) PRIVATE -sym(vpx_plane_add_noise_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; get the clamps in registers - mov rdx, arg(2) ; blackclamp - movq mm3, [rdx] - mov rdx, arg(3) ; whiteclamp - movq mm4, [rdx] - mov rdx, arg(4) ; bothclamp - movq mm5, [rdx] - -.addnoise_loop: - call sym(LIBVPX_RAND) WRT_PLT - mov rcx, arg(1) ;noise - and rax, 0xff - add rcx, rax - - mov rdi, rcx - movsxd rcx, dword arg(5) ;[Width] - mov rsi, arg(0) ;Pos - xor rax,rax - -.addnoise_nextset: - movq mm1,[rsi+rax] ; get the source - - psubusb mm1, mm3 ; subtract black clamp - paddusb mm1, mm5 ; add both clamp - psubusb mm1, mm4 ; subtract whiteclamp - - movq mm2,[rdi+rax] ; get the noise for this line - paddb mm1,mm2 ; add it in - movq [rsi+rax],mm1 ; store the result - - add rax,8 ; move to the next line - - cmp rax, rcx - jl .addnoise_nextset - - movsxd rax, dword arg(7) ; Pitch - add arg(0), rax ; Start += Pitch - sub dword arg(6), 1 ; Height -= 1 - jg .addnoise_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -Blur: - times 16 dw 16 - times 8 dw 64 - times 16 dw 16 - times 8 dw 0 - -rd: - times 4 dw 0x40 diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm index c24d53686..cd6a6ae98 100644 --- a/vpx_dsp/x86/intrapred_sse2.asm +++ b/vpx_dsp/x86/intrapred_sse2.asm @@ -11,6 +11,7 @@ %include "third_party/x86inc/x86inc.asm" SECTION_RODATA +pb_1: times 16 db 1 pw_4: times 8 dw 4 pw_8: times 8 dw 8 pw_16: times 8 dw 16 @@ -23,6 +24,115 @@ pw2_32: times 8 dw 16 SECTION .text +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM sse2 +cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + DEFINE_ARGS dst, stride, temp + psrldq m1, m0, 1 + psrldq m2, m0, 2 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + + ; store 4 lines + movd [dstq ], m3 + psrlq m3, 8 + movd [dstq+strideq ], m3 + lea dstq, [dstq+strideq*2] + psrlq m3, 8 + movd [dstq ], m3 + psrlq m3, 8 + movd [dstq+strideq ], m3 + psrlq m0, 56 + movd tempq, m0 + mov [dstq+strideq+3], tempb + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset + GET_GOT goffsetq + + movu m1, [aboveq] + pslldq m0, m1, 1 + psrldq m2, m1, 1 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 + punpckhbw m0, m0 ; 7 7 + punpcklwd m0, m0 ; 7 7 7 7 + punpckldq m0, m0 ; 7 7 7 7 7 7 7 7 + punpcklqdq m3, m0 ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7 + + ; store 4 lines + psrldq m3, 1 + movq [dstq ], m3 + psrldq m3, 1 + movq [dstq+strideq ], m3 + psrldq m3, 1 + movq [dstq+strideq*2], m3 + psrldq m3, 1 + movq [dstq+stride3q ], m3 + lea dstq, [dstq+strideq*4] + + ; store next 4 lines + psrldq m3, 1 + movq [dstq ], m3 + psrldq m3, 1 + movq [dstq+strideq ], m3 + psrldq m3, 1 + movq [dstq+strideq*2], m3 + psrldq m3, 1 + movq [dstq+stride3q ], m3 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset + GET_GOT goffsetq + + movd m0, [leftq] ; abcd [byte] + punpcklbw m4, m0, m0 ; aabb ccdd + punpcklwd m4, m4 ; aaaa bbbb cccc dddd + psrldq m4, 12 ; dddd + punpckldq m0, m4 ; abcd dddd + psrldq m1, m0, 1 ; bcdd + psrldq m2, m0, 2 ; cddd + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; a2bc b2cd c3d d + pavgb m1, m0 ; ab, bc, cd, d [byte] + + punpcklbw m1, m3 ; ab, a2bc, bc, b2cd, cd, c3d, d, d + movd [dstq ], m1 + psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d + movd [dstq+strideq], m1 + + lea dstq, [dstq+strideq*2] + psrlq m1, 16 ; cd, c3d, d, d + movd [dstq ], m1 + movd [dstq+strideq], m4 ; d, d, d, d + RESTORE_GOT + RET + INIT_XMM sse2 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm index d061278c7..5e0139fa8 100644 --- a/vpx_dsp/x86/intrapred_ssse3.asm +++ b/vpx_dsp/x86/intrapred_ssse3.asm @@ -13,7 +13,6 @@ SECTION_RODATA pb_1: times 16 db 1 -sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 @@ -28,77 +27,9 @@ sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 SECTION .text -INIT_MMX ssse3 -cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - pshufb m2, m0, [GLOBAL(sh_b23456777)] - pshufb m1, m0, [GLOBAL(sh_b01234577)] - pshufb m0, [GLOBAL(sh_b12345677)] - pavgb m3, m2, m1 - pxor m2, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; store 4 lines - movd [dstq ], m0 - psrlq m0, 8 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - psrlq m0, 8 - movd [dstq ], m0 - psrlq m0, 8 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_MMX ssse3 -cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - mova m1, [GLOBAL(sh_b12345677)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m2, m0, [GLOBAL(sh_b23456777)] - pavgb m3, m2, m0 - pxor m2, m0 - pshufb m0, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; store 4 lines - movq [dstq ], m0 - pshufb m0, m1 - movq [dstq+strideq ], m0 - pshufb m0, m1 - movq [dstq+strideq*2], m0 - pshufb m0, m1 - movq [dstq+stride3q ], m0 - pshufb m0, m1 - lea dstq, [dstq+strideq*4] - - ; store next 4 lines - movq [dstq ], m0 - pshufb m0, m1 - movq [dstq+strideq ], m0 - pshufb m0, m1 - movq [dstq+strideq*2], m0 - pshufb m0, m1 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - INIT_XMM ssse3 cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset GET_GOT goffsetq @@ -715,28 +646,6 @@ cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset RESTORE_GOT RET -INIT_MMX ssse3 -cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; abcd [byte] - pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte] - pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 - pavgb m1, m0 ; ab, bc, cd, d [byte] - - punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d - movd [dstq ], m1 - psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d - movd [dstq+strideq], m1 - lea dstq, [dstq+strideq*2] - psrlq m1, 16 ; cd, c3d, d, d - movd [dstq ], m1 - pshufw m1, m1, q1111 ; d, d, d, d - movd [dstq+strideq], m1 - RESTORE_GOT - RET - INIT_XMM ssse3 cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset GET_GOT goffsetq diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm deleted file mode 100644 index 45d0ecc0d..000000000 --- a/vpx_dsp/x86/loopfilter_mmx.asm +++ /dev/null @@ -1,436 +0,0 @@ -; -; Copyright (c) 2016 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -tfe: - times 8 db 0xfe -t80: - times 8 db 0x80 -t3: - times 8 db 0x03 -t4: - times 8 db 0x04 -ones: - times 4 dw 0x0001 - -SECTION .text - -%define stkreg rsp - -%define t0 0 -%define t1 t0 + 16 -%define p1 t1 + 16 -%define p0 p1 + 16 -%define q0 p0 + 16 -%define q1 q0 + 16 -%define lstacksize q1 + 16 - -%define goffsetq _limitq - -;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int src_pixel_step, -; const char *blimit, const char *limit, -; const char *thresh); -INIT_MMX mmx -cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \ - s, p, _blimit, _limit, _thresh, s1 - movq m7, [_limitq] - GET_GOT goffsetq -%if GET_GOT_DEFINED=1 - add rsp, gprsize ; restore stack -%endif - lea s1q, [sq + pq] ; s1q points to row +1 - - ; calculate breakout conditions - movq m2, [s1q + 2 * pq] ; q3 - movq m1, [ sq + 2 * pq] ; q2 - movq m6, m1 ; q2 - psubusb m1, m2 ; q2-=q3 - psubusb m2, m6 ; q3-=q2 - por m1, m2 ; abs(q3-q2) - psubusb m1, m7 - movq m4, [sq + pq] ; q1 - movq m3, m4 ; q1 - psubusb m4, m6 ; q1-=q2 - psubusb m6, m3 ; q2-=q1 - por m4, m6 ; abs(q2-q1) - psubusb m4, m7 - por m1, m4 - movq m4, [sq] ; q0 - movq m0, m4 ; q0 - psubusb m4, m3 ; q0-=q1 - psubusb m3, m0 ; q1-=q0 - por m4, m3 ; abs(q0-q1) - movq [stkreg + t0], m4 ; save to t0 - psubusb m4, m7 - por m1, m4 - neg pq ; negate pitch to deal with - ; above border - movq m2, [ sq + 4 * pq] ; p3 - movq m4, [s1q + 4 * pq] ; p2 - movq m5, m4 ; p2 - psubusb m4, m2 ; p2-=p3 - psubusb m2, m5 ; p3-=p2 - por m4, m2 ; abs(p3 - p2) - psubusb m4, m7 - por m1, m4 - movq m4, [sq + 2 * pq] ; p1 - movq m3, m4 ; p1 - psubusb m4, m5 ; p1-=p2 - psubusb m5, m3 ; p2-=p1 - por m4, m5 ; abs(p2 - p1) - psubusb m4, m7 - por m1, m4 - movq m2, m3 ; p1 - movq m4, [sq + pq] ; p0 - movq m5, m4 ; p0 - psubusb m4, m3 ; p0-=p1 - psubusb m3, m5 ; p1-=p0 - por m4, m3 ; abs(p1 - p0) - movq [stkreg + t1], m4 ; save to t1 - psubusb m4, m7 - por m1, m4 - movq m3, [s1q] ; q1 - movq m4, m3 ; q1 - psubusb m3, m2 ; q1-=p1 - psubusb m2, m4 ; p1-=q1 - por m2, m3 ; abs(p1-q1) - pand m2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw m2, 1 ; abs(p1-q1)/2 - movq m6, m5 ; p0 - movq m3, [sq] ; q0 - psubusb m5, m3 ; p0-=q0 - psubusb m3, m6 ; q0-=p0 - por m5, m3 ; abs(p0 - q0) - paddusb m5, m5 ; abs(p0-q0)*2 - paddusb m5, m2 ; abs (p0 - q0) * 2 + abs(p1-q1)/2 - movq m7, [_blimitq] ; blimit - psubusb m5, m7 ; abs (p0 - q0) * 2 + - ; abs(p1-q1)/2 > blimit - por m1, m5 - pxor m5, m5 - pcmpeqb m1, m5 ; mask m1 - - ; calculate high edge variance - movq m7, [_threshq] - movq m4, [stkreg + t0] ; get abs (q1 - q0) - psubusb m4, m7 - movq m3, [stkreg + t1] ; get abs (p1 - p0) - psubusb m3, m7 - paddb m4, m3 ; abs(q1 - q0) > thresh || - ; abs(p1 - p0) > thresh - pcmpeqb m4, m5 - pcmpeqb m5, m5 - movq m3, [GLOBAL(t80)] - pxor m4, m5 - - ; start work on filters - movq m2, [sq + 2 * pq] ; p1 - movq m7, [s1q] ; q1 - pxor m2, m3 ; p1 converted to signed values - pxor m7, m3 ; q1 converted to signed values - psubsb m2, m7 ; p1 - q1 - pand m2, m4 ; high var mask (hvm)(p1 - q1) - pxor m6, m3 ; p0 converted to signed values - pxor m0, m3 ; q0 converted to signed values - movq m3, m0 ; q0 - psubsb m0, m6 ; q0 - p0 - paddsb m2, m0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb m2, m0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb m2, m0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand m1, m2 ; mask filter values we don't - ; care about - movq m2, m1 - paddsb m1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb m2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - pxor m0, m0 - pxor m5, m5 - punpcklbw m0, m2 - punpckhbw m5, m2 - psraw m0, 11 - psraw m5, 11 - packsswb m0, m5 - movq m2, m0 ; (3* (q0 - p0) + hvm(p1 - q1) - ; + 3) >> 3; - pxor m0, m0 - movq m5, m1 ; abcdefgh - punpcklbw m0, m1 ; e0f0g0h0 - psraw m0, 11 ; sign extended shift right by 3 - pxor m1, m1 - punpckhbw m1, m5 ; a0b0c0d0 - psraw m1, 11 ; sign extended shift right by 3 - movq m5, m0 ; save results - - packsswb m0, m1 ; (3* (q0 - p0) + hvm(p1 - q1) - ; + 4) >>3 - paddsw m5, [GLOBAL(ones)] - paddsw m1, [GLOBAL(ones)] - psraw m5, 1 - psraw m1, 1 - packsswb m5, m1 ; (3* (q0 - p0) + hvm(p1 - q1) - ; + 4) >>4 - movq m1, [GLOBAL(t80)] - pandn m4, m5 ; high edge variance additive - paddsb m6, m2 ; p0+= p0 add - pxor m6, m1 ; unoffset - movq [sq + pq], m6 ; write back - movq m6, [sq + 2 * pq] ; p1 - pxor m6, m1 ; reoffset - paddsb m6, m4 ; p1+= p1 add - pxor m6, m1 ; unoffset - movq [sq + 2 * pq], m6 ; write back - psubsb m3, m0 ; q0-= q0 add - pxor m3, m1 ; unoffset - movq [sq], m3 ; write back - psubsb m7, m4 ; q1-= q1 add - pxor m7, m1 ; unoffset - movq [s1q], m7 ; write back - RET - -;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int src_pixel_step, -; const char *blimit, const char *limit, -; const char *thresh); -INIT_MMX mmx -cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \ - s, p, _blimit, _limit, _thresh, s1 - lea sq, [sq + pq * 4 - 4] - lea s1q, [sq + pq] ; s1q points to row +1 - ;transpose - movq m6, [ sq + 2 * pq] ; 67 66 65 64 63 62 61 60 - movq m7, m6 ; 77 76 75 74 73 72 71 70 - punpckhbw m7, [s1q + 2 * pq] ; 77 67 76 66 75 65 74 64 - punpcklbw m6, [s1q + 2 * pq] ; 73 63 72 62 71 61 70 60 - movq m4, [sq] ; 47 46 45 44 43 42 41 40 - movq m5, m4 ; 47 46 45 44 43 42 41 40 - punpckhbw m5, [sq + pq] ; 57 47 56 46 55 45 54 44 - punpcklbw m4, [sq + pq] ; 53 43 52 42 51 41 50 40 - movq m3, m5 ; 57 47 56 46 55 45 54 44 - punpckhwd m5, m7 ; 77 67 57 47 76 66 56 46 - punpcklwd m3, m7 ; 75 65 55 45 74 64 54 44 - movq m2, m4 ; 53 43 52 42 51 41 50 40 - punpckhwd m4, m6 ; 73 63 53 43 72 62 52 42 - punpcklwd m2, m6 ; 71 61 51 41 70 60 50 40 - neg pq - movq m6, [ sq + pq * 2] ; 27 26 25 24 23 22 21 20 - movq m1, m6 ; 27 26 25 24 23 22 21 20 - punpckhbw m6, [ sq + pq ] ; 37 27 36 36 35 25 34 24 - punpcklbw m1, [ sq + pq ] ; 33 23 32 22 31 21 30 20 - movq m7, [ sq + pq * 4]; ; 07 06 05 04 03 02 01 00 - punpckhbw m7, [s1q + pq * 4] ; 17 07 16 06 15 05 14 04 - movq m0, m7 ; 17 07 16 06 15 05 14 04 - punpckhwd m7, m6 ; 37 27 17 07 36 26 16 06 - punpcklwd m0, m6 ; 35 25 15 05 34 24 14 04 - movq m6, m7 ; 37 27 17 07 36 26 16 06 - punpckhdq m7, m5 ; 77 67 57 47 37 27 17 07 = q3 - punpckldq m6, m5 ; 76 66 56 46 36 26 16 06 = q2 - movq m5, m6 ; 76 66 56 46 36 26 16 06 - psubusb m5, m7 ; q2-q3 - psubusb m7, m6 ; q3-q2 - por m7, m5; ; m7=abs (q3-q2) - movq m5, m0 ; 35 25 15 05 34 24 14 04 - punpckhdq m5, m3 ; 75 65 55 45 35 25 15 05 = q1 - punpckldq m0, m3 ; 74 64 54 44 34 24 15 04 = q0 - movq m3, m5 ; 75 65 55 45 35 25 15 05 = q1 - psubusb m3, m6 ; q1-q2 - psubusb m6, m5 ; q2-q1 - por m6, m3 ; m6=abs(q2-q1) - - movq [stkreg + q1], m5 ; save q1 - movq [stkreg + q0], m0 ; save q0 - - movq m3, [ sq + pq * 4] ; 07 06 05 04 03 02 01 00 - punpcklbw m3, [s1q + pq * 4] ; 13 03 12 02 11 01 10 00 - movq m0, m3 ; 13 03 12 02 11 01 10 00 - punpcklwd m0, m1 ; 31 21 11 01 30 20 10 00 - punpckhwd m3, m1 ; 33 23 13 03 32 22 12 02 - movq m1, m0 ; 31 21 11 01 30 20 10 00 - punpckldq m0, m2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq m1, m2 ; 71 61 51 41 31 21 11 01 =p2 - movq m2, m1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb m2, m0 ; p2-p3 - psubusb m0, m1 ; p3-p2 - por m0, m2 ; m0=abs(p3-p2) - movq m2, m3 ; 33 23 13 03 32 22 12 02 - punpckldq m2, m4 ; 72 62 52 42 32 22 12 02 = p1 - punpckhdq m3, m4 ; 73 63 53 43 33 23 13 03 = p0 - - movq [stkreg + p0], m3 ; save p0 - movq [stkreg + p1], m2 ; save p1 - movq m5, m2 ; m5 = p1 - psubusb m2, m1 ; p1-p2 - psubusb m1, m5 ; p2-p1 - por m1, m2 ; m1=abs(p2-p1) - movq m4, [_limitq] - GET_GOT goffsetq -%if GET_GOT_DEFINED=1 - add rsp, gprsize ; restore stack -%endif - psubusb m7, m4 - psubusb m0, m4 - psubusb m1, m4 - psubusb m6, m4 - por m7, m6 - por m0, m1 - por m0, m7 ; abs(q3-q2) > limit || - ; abs(p3-p2) > limit || - ; abs(p2-p1) > limit || - ; abs(q2-q1) > limit - movq m1, m5 ; p1 - movq m7, m3 ; m3=m7=p0 - psubusb m7, m5 ; p0 - p1 - psubusb m5, m3 ; p1 - p0 - por m5, m7 ; abs(p1-p0) - movq [stkreg + t0], m5 ; save abs(p1-p0) - psubusb m5, m4 - por m0, m5 ; m0=mask - movq m5, [stkreg + q0] ; m5=q0 - movq m7, [stkreg + q1] ; m7=q1 - movq m6, m5 ; m6=q0 - movq m2, m7 ; q1 - psubusb m5, m7 ; q0-q1 - psubusb m7, m6 ; q1-q0 - por m7, m5 ; abs(q1-q0) - movq [stkreg + t1], m7 ; save abs(q1-q0) - psubusb m7, m4 - por m0, m7 ; mask - movq m5, m2 ; q1 - psubusb m5, m1 ; q1-=p1 - psubusb m1, m2 ; p1-=q1 - por m5, m1 ; abs(p1-q1) - pand m5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw m5, 1 ; abs(p1-q1)/2 - movq m4, [_blimitq] - movq m1, m3 ; m1=m3=p0 - movq m7, m6 ; m7=m6=q0 - psubusb m1, m7 ; p0-q0 - psubusb m7, m3 ; q0-p0 - por m1, m7 ; abs(q0-p0) - paddusb m1, m1 ; abs(q0-p0)*2 - paddusb m1, m5 ; abs(p0 - q0)*2 + abs(p1-q1)/2 - psubusb m1, m4 ; abs(p0 - q0)*2 + abs(p1-q1)/2 - ; > blimit - por m1, m0; ; mask - pxor m0, m0 - pcmpeqb m1, m0 - - ; calculate high edge variance - movq m7, [_threshq] - movq m4, [stkreg + t0] ; get abs (q1 - q0) - psubusb m4, m7 - movq m3, [stkreg + t1] ; get abs (p1 - p0) - psubusb m3, m7 - por m4, m3 ; abs(q1 - q0) > thresh || - ; abs(p1 - p0) > thresh - pcmpeqb m4, m0 - pcmpeqb m0, m0 - movq m3, [GLOBAL(t80)] - pxor m4, m0 - - ; start work on filters - movq m2, [stkreg + p1] - movq m7, [stkreg + q1] - movq m6, [stkreg + p0] - movq m0, [stkreg + q0] - pxor m2, m3 - pxor m7, m3 - psubsb m2, m7 ; p1 - q1 - pand m2, m4 ; high var mask (hvm)(p1 - q1) - pxor m6, m3 - pxor m0, m3 - movq m3, m0 ; q0 - psubsb m0, m6 ; q0 - p0 - paddsb m2, m0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb m2, m0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb m2, m0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand m1, m2 ; mask filter values we don't - ; care about - movq m2, m1 - paddsb m1, [GLOBAL(t4)] ; 3*(q0 - p0) + hvm(p1 - q1) + 4 - paddsb m2, [GLOBAL(t3)] ; 3*(q0 - p0) + hvm(p1 - q1) + 3 - pxor m0, m0 - pxor m5, m5 - punpcklbw m0, m2 - punpckhbw m5, m2 - psraw m0, 11 - psraw m5, 11 - packsswb m0, m5 - movq m2, m0 ; (3*(q0 - p0) + hvm(p1 - q1) - ; + 3) >> 3; - pxor m0, m0 - movq m5, m1 ; abcdefgh - punpcklbw m0, m1 ; e0f0g0h0 - psraw m0, 11 ; sign extended shift right by 3 - pxor m1, m1 - punpckhbw m1, m5 ; a0b0c0d0 - psraw m1, 11 ; sign extended shift right by 3 - movq m5, m0 ; save results - packsswb m0, m1 ; (3*(q0 - p0) + hvm(p1 - q1) - ; + 4) >>3 - paddsw m5, [GLOBAL(ones)] - paddsw m1, [GLOBAL(ones)] - psraw m5, 1 - psraw m1, 1 - packsswb m5, m1 ; (3* (q0 - p0) + hvm(p1 - q1) - ; + 4) >>4 - pandn m4, m5 ; high edge variance additive - movq m5, [GLOBAL(t80)] - paddsb m6, m2 ; p0+= p0 add - pxor m6, m5 ; unoffset - ; m6=p0 - movq m1, [stkreg + p1] - pxor m1, m5 ; reoffset - paddsb m1, m4 ; p1+= p1 add - pxor m1, m5 ; unoffset - ; m6 = p0 m1 = p1 - psubsb m3, m0 ; q0-= q0 add - pxor m3, m5 ; unoffset - ; m3 = q0 - psubsb m7, m4 ; q1-= q1 add - pxor m7, m5 ; unoffset - ; m7 = q1 - ; transpose and write back - ; m1 = 72 62 52 42 32 22 12 02 - ; m6 = 73 63 53 43 33 23 13 03 - ; m3 = 74 64 54 44 34 24 14 04 - ; m7 = 75 65 55 45 35 25 15 05 - movq m2, m1 ; 72 62 52 42 32 22 12 02 - punpcklbw m2, m6 ; 33 32 23 22 13 12 03 02 - movq m4, m3 ; 74 64 54 44 34 24 14 04 - punpckhbw m1, m6 ; 73 72 63 62 53 52 43 42 - punpcklbw m4, m7 ; 35 34 25 24 15 14 05 04 - punpckhbw m3, m7 ; 75 74 65 64 55 54 45 44 - movq m6, m2 ; 33 32 23 22 13 12 03 02 - punpcklwd m2, m4 ; 15 14 13 12 05 04 03 02 - punpckhwd m6, m4 ; 35 34 33 32 25 24 23 22 - movq m5, m1 ; 73 72 63 62 53 52 43 42 - punpcklwd m1, m3 ; 55 54 53 52 45 44 43 42 - punpckhwd m5, m3 ; 75 74 73 72 65 64 63 62 - - ; m2 = 15 14 13 12 05 04 03 02 - ; m6 = 35 34 33 32 25 24 23 22 - ; m5 = 55 54 53 52 45 44 43 42 - ; m1 = 75 74 73 72 65 64 63 62 - movd [sq + pq * 4 + 2], m2 - psrlq m2, 32 - movd [s1q + pq * 4 + 2], m2 - movd [sq + pq * 2 + 2], m6 - psrlq m6, 32 - movd [sq + pq + 2], m6 - movd [sq + 2], m1 - psrlq m1, 32 - movd [s1q + 2], m1 - neg pq - movd [s1q + pq + 2], m5 - psrlq m5, 32 - movd [s1q + pq * 2 + 2], m5 - RET diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index e03508a03..39a6ae3a8 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -18,6 +18,213 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } +// filter_mask and hev_mask +#define FILTER_HEV_MASK do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1, work; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask(*limit, *blimit, */ \ + /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + /* abs(p3 - p2), abs(p2 - p1) */ \ + work = abs_diff(p3p2, p2p1); \ + flat = _mm_max_epu8(work, flat); \ + /* abs(q3 - q2), abs(q2 - q1) */ \ + work = abs_diff(q3q2, q2q1); \ + flat = _mm_max_epu8(work, flat); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ +} while (0) + +#define FILTER4 do { \ + const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, \ + 4, 4, 4, 4, 4, 4, 4, 4); \ + const __m128i t80 = _mm_set1_epi8(0x80); \ + __m128i filter, filter2filter1, work; \ + \ + ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ + qs1qs0 = _mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ +} while (0) + +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + p3p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s - 4 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s + 0 * p))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); + + FILTER_HEV_MASK; + FILTER4; + + _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +} + +void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; + __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + + // Transpose 8x8 + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + p1p0 = _mm_unpacklo_epi16(q1q0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x0 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + p3p2 = _mm_unpacklo_epi32(p1p0, x0); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + p1p0 = _mm_unpackhi_epi32(p1p0, x0); + p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high + p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + q1q0 = _mm_unpackhi_epi16(q1q0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x2 = _mm_unpackhi_epi16(x2, x3); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + q3q2 = _mm_unpackhi_epi32(q1q0, x2); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + q1q0 = _mm_unpacklo_epi32(q1q0, x2); + + q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); + q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); + + FILTER_HEV_MASK; + FILTER4; + + // Transpose 8x4 to 4x8 + // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 + // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); + // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 + x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); + // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); + + *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + + *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); +} + void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm index be359759c..cee4468c1 100644 --- a/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/vpx_dsp/x86/subpel_variance_sse2.asm @@ -57,8 +57,8 @@ SECTION .text paddd %6, %1 %endmacro -%macro STORE_AND_RET 0 -%if mmsize == 16 +%macro STORE_AND_RET 1 +%if %1 > 4 ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. ; We have to sign-extend it before adding the words within the register @@ -78,16 +78,16 @@ SECTION .text movd [r1], m7 ; store sse paddd m6, m4 movd raxd, m6 ; store sum as return value -%else ; mmsize == 8 - pshufw m4, m6, 0xe - pshufw m3, m7, 0xe +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe paddw m6, m4 paddd m7, m3 pcmpgtw m5, m6 ; mask for 0 > x mov r1, ssem ; r1 = unsigned int *sse punpcklwd m6, m5 ; sign-extend m6 word->dword movd [r1], m7 ; store sse - pshufw m4, m6, 0xe + pshuflw m4, m6, 0xe paddd m6, m4 movd raxd, m6 ; store sum as return value %endif @@ -196,6 +196,12 @@ SECTION .text %endif %endif +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 pxor m6, m6 ; sum pxor m7, m7 ; sse @@ -228,6 +234,7 @@ SECTION .text %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 + %if %2 == 0 ; !avg punpckhbw m3, m1, m5 punpcklbw m1, m5 @@ -237,24 +244,37 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] + movx m0, [srcq] %if %2 == 1 ; avg -%if mmsize == 16 +%if %1 > 4 movhps m0, [srcq+src_strideq] -%else ; mmsize == 8 - punpckldq m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 %endif %else ; !avg - movh m2, [srcq+src_strideq] + movx m2, [srcq+src_strideq] %endif - movh m1, [dstq] - movh m3, [dstq+dst_strideq] + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + %if %2 == 1 ; avg +%if %1 > 4 pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif punpcklbw m3, m5 punpcklbw m1, m5 +%if %1 > 4 punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif %else ; !avg punpcklbw m0, m5 punpcklbw m2, m5 @@ -271,7 +291,7 @@ SECTION .text %endif dec block_height jg .x_zero_y_zero_loop - STORE_AND_RET + STORE_AND_RET %1 .x_zero_y_nonzero: cmp y_offsetd, 4 @@ -296,37 +316,41 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m2, [srcq+src_strideq] + movx m0, [srcq] + movx m2, [srcq+src_strideq] %if %2 == 1 ; avg -%if mmsize == 16 +%if %1 > 4 movhps m2, [srcq+src_strideq*2] -%else ; mmsize == 8 -%if %1 == 4 - movh m1, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] punpckldq m2, m1 -%else - punpckldq m2, [srcq+src_strideq*2] -%endif %endif - movh m1, [dstq] -%if mmsize == 16 + movx m1, [dstq] +%if %1 > 4 movlhps m0, m2 -%else ; mmsize == 8 +%else ; 4xh punpckldq m0, m2 %endif - movh m3, [dstq+dst_strideq] + movx m3, [dstq+dst_strideq] pavgb m0, m2 punpcklbw m1, m5 +%if %1 > 4 pavgb m0, [secq] punpcklbw m3, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %else ; !avg - movh m4, [srcq+src_strideq*2] - movh m1, [dstq] + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] pavgb m0, m2 - movh m3, [dstq+dst_strideq] + movx m3, [dstq+dst_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -343,7 +367,7 @@ SECTION .text %endif dec block_height jg .x_zero_y_half_loop - STORE_AND_RET + STORE_AND_RET %1 .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation @@ -351,7 +375,7 @@ SECTION .text lea bilin_filter, [bilin_filter_m] %endif shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] @@ -424,12 +448,12 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m2, [srcq+src_strideq] - movh m4, [srcq+src_strideq*2] - movh m3, [dstq+dst_strideq] + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] %if cpuflag(ssse3) - movh m1, [dstq] + movx m1, [dstq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -449,17 +473,27 @@ SECTION .text pmullw m4, filter_y_b paddw m0, m1 paddw m2, filter_rnd - movh m1, [dstq] + movx m1, [dstq] paddw m2, m4 %endif psraw m0, 4 psraw m2, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif packuswb m0, m2 +%if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 @@ -475,7 +509,7 @@ SECTION .text %undef filter_y_a %undef filter_y_b %undef filter_rnd - STORE_AND_RET + STORE_AND_RET %1 .x_nonzero: cmp x_offsetd, 4 @@ -503,30 +537,40 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m4, [srcq+1] + movx m0, [srcq] + movx m4, [srcq+1] %if %2 == 1 ; avg -%if mmsize == 16 +%if %1 > 4 movhps m0, [srcq+src_strideq] movhps m4, [srcq+src_strideq+1] -%else ; mmsize == 8 - punpckldq m0, [srcq+src_strideq] - punpckldq m4, [srcq+src_strideq+1] -%endif - movh m1, [dstq] - movh m3, [dstq+dst_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] pavgb m0, m4 punpcklbw m3, m5 +%if %1 > 4 pavgb m0, [secq] punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %else ; !avg - movh m2, [srcq+src_strideq] - movh m1, [dstq] + movx m2, [srcq+src_strideq] + movx m1, [dstq] pavgb m0, m4 - movh m4, [srcq+src_strideq+1] - movh m3, [dstq+dst_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -543,7 +587,7 @@ SECTION .text %endif dec block_height jg .x_half_y_zero_loop - STORE_AND_RET + STORE_AND_RET %1 .x_half_y_nonzero: cmp y_offsetd, 4 @@ -578,53 +622,58 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m3, [srcq+1] + movx m0, [srcq] + movx m3, [srcq+1] add srcq, src_strideq pavgb m0, m3 .x_half_y_half_loop: - movh m2, [srcq] - movh m3, [srcq+1] + movx m2, [srcq] + movx m3, [srcq+1] %if %2 == 1 ; avg -%if mmsize == 16 +%if %1 > 4 movhps m2, [srcq+src_strideq] movhps m3, [srcq+src_strideq+1] %else -%if %1 == 4 - movh m1, [srcq+src_strideq] + movx m1, [srcq+src_strideq] punpckldq m2, m1 - movh m1, [srcq+src_strideq+1] + movx m1, [srcq+src_strideq+1] punpckldq m3, m1 -%else - punpckldq m2, [srcq+src_strideq] - punpckldq m3, [srcq+src_strideq+1] -%endif %endif pavgb m2, m3 -%if mmsize == 16 +%if %1 > 4 movlhps m0, m2 movhlps m4, m2 -%else ; mmsize == 8 +%else ; 4xh punpckldq m0, m2 - pshufw m4, m2, 0xe + pshuflw m4, m2, 0xe %endif - movh m1, [dstq] + movx m1, [dstq] pavgb m0, m2 - movh m3, [dstq+dst_strideq] + movx m3, [dstq+dst_strideq] +%if %1 > 4 pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif punpcklbw m3, m5 punpcklbw m1, m5 +%if %1 > 4 punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif %else ; !avg - movh m4, [srcq+src_strideq] - movh m1, [srcq+src_strideq+1] + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] pavgb m2, m3 pavgb m4, m1 pavgb m0, m2 pavgb m2, m4 - movh m1, [dstq] - movh m3, [dstq+dst_strideq] + movx m1, [dstq] + movx m3, [dstq+dst_strideq] punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 @@ -641,7 +690,7 @@ SECTION .text %endif dec block_height jg .x_half_y_half_loop - STORE_AND_RET + STORE_AND_RET %1 .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation @@ -649,7 +698,7 @@ SECTION .text lea bilin_filter, [bilin_filter_m] %endif shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] @@ -724,23 +773,23 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m3, [srcq+1] + movx m0, [srcq] + movx m3, [srcq+1] add srcq, src_strideq pavgb m0, m3 %if notcpuflag(ssse3) punpcklbw m0, m5 %endif .x_half_y_other_loop: - movh m2, [srcq] - movh m1, [srcq+1] - movh m4, [srcq+src_strideq] - movh m3, [srcq+src_strideq+1] + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] pavgb m2, m1 pavgb m4, m3 - movh m3, [dstq+dst_strideq] + movx m3, [dstq+dst_strideq] %if cpuflag(ssse3) - movh m1, [dstq] + movx m1, [dstq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -760,16 +809,26 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m2, m1 - movh m1, [dstq] + movx m1, [dstq] %endif psraw m0, 4 psraw m2, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif packuswb m0, m2 +%if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 @@ -786,7 +845,7 @@ SECTION .text %undef filter_y_a %undef filter_y_b %undef filter_rnd - STORE_AND_RET + STORE_AND_RET %1 .x_nonhalf: test y_offsetd, y_offsetd @@ -797,7 +856,7 @@ SECTION .text lea bilin_filter, [bilin_filter_m] %endif shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] @@ -865,14 +924,14 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m1, [srcq+1] - movh m2, [srcq+src_strideq] - movh m4, [srcq+src_strideq+1] - movh m3, [dstq+dst_strideq] + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] %if cpuflag(ssse3) punpcklbw m0, m1 - movh m1, [dstq] + movx m1, [dstq] punpcklbw m2, m4 pmaddubsw m0, filter_x_a pmaddubsw m2, filter_x_a @@ -892,17 +951,27 @@ SECTION .text pmullw m4, filter_x_b paddw m0, m1 paddw m2, filter_rnd - movh m1, [dstq] + movx m1, [dstq] paddw m2, m4 %endif psraw m0, 4 psraw m2, 4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif packuswb m0, m2 +%if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %endif punpcklbw m1, m5 SUM_SSE m0, m1, m2, m3, m6, m7 @@ -918,7 +987,7 @@ SECTION .text %undef filter_x_a %undef filter_x_b %undef filter_rnd - STORE_AND_RET + STORE_AND_RET %1 .x_nonhalf_y_nonzero: cmp y_offsetd, 4 @@ -929,7 +998,7 @@ SECTION .text lea bilin_filter, [bilin_filter_m] %endif shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] @@ -1037,8 +1106,8 @@ SECTION .text add srcq, src_strideq add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m1, [srcq+1] + movx m0, [srcq] + movx m1, [srcq+1] %if cpuflag(ssse3) punpcklbw m0, m1 pmaddubsw m0, filter_x_a @@ -1054,17 +1123,17 @@ SECTION .text add srcq, src_strideq psraw m0, 4 .x_other_y_half_loop: - movh m2, [srcq] - movh m1, [srcq+1] - movh m4, [srcq+src_strideq] - movh m3, [srcq+src_strideq+1] + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] %if cpuflag(ssse3) punpcklbw m2, m1 punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movh m1, [dstq] - movh m3, [dstq+dst_strideq] + movx m1, [dstq] + movx m3, [dstq+dst_strideq] paddw m2, filter_rnd paddw m4, filter_rnd %else @@ -1079,9 +1148,9 @@ SECTION .text pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 - movh m1, [dstq] + movx m1, [dstq] paddw m4, m3 - movh m3, [dstq+dst_strideq] + movx m3, [dstq+dst_strideq] %endif psraw m2, 4 psraw m4, 4 @@ -1089,10 +1158,20 @@ SECTION .text pavgw m2, m4 %if %2 == 1 ; avg ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif packuswb m0, m2 +%if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %endif punpcklbw m3, m5 punpcklbw m1, m5 @@ -1110,7 +1189,7 @@ SECTION .text %undef filter_x_a %undef filter_x_b %undef filter_rnd - STORE_AND_RET + STORE_AND_RET %1 .x_nonhalf_y_nonhalf: %ifdef PIC @@ -1118,7 +1197,7 @@ SECTION .text %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] @@ -1261,8 +1340,8 @@ SECTION .text INC_SRC_BY_SRC_STRIDE add dstq, dst_strideq %else ; %1 < 16 - movh m0, [srcq] - movh m1, [srcq+1] + movx m0, [srcq] + movx m1, [srcq+1] %if cpuflag(ssse3) punpcklbw m0, m1 pmaddubsw m0, filter_x_a @@ -1283,20 +1362,20 @@ SECTION .text INC_SRC_BY_SRC_STRIDE .x_other_y_other_loop: - movh m2, [srcq] - movh m1, [srcq+1] + movx m2, [srcq] + movx m1, [srcq+1] INC_SRC_BY_SRC_STRIDE - movh m4, [srcq] - movh m3, [srcq+1] + movx m4, [srcq] + movx m3, [srcq+1] %if cpuflag(ssse3) punpcklbw m2, m1 punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movh m3, [dstq+dst_strideq] - movh m1, [dstq] + movx m3, [dstq+dst_strideq] + movx m1, [dstq] paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 @@ -1335,9 +1414,9 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m0, m3 - movh m3, [dstq+dst_strideq] + movx m3, [dstq+dst_strideq] paddw m2, m1 - movh m1, [dstq] + movx m1, [dstq] psraw m0, 4 psraw m2, 4 punpcklbw m3, m5 @@ -1345,10 +1424,20 @@ SECTION .text %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif packuswb m0, m2 +%if %1 > 4 pavgb m0, [secq] punpckhbw m2, m0, m5 punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif %endif SUM_SSE m0, m1, m2, m3, m6, m7 mova m0, m4 @@ -1366,7 +1455,8 @@ SECTION .text %undef filter_y_a %undef filter_y_b %undef filter_rnd - STORE_AND_RET +%undef movx + STORE_AND_RET %1 %endmacro ; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical @@ -1375,26 +1465,22 @@ SECTION .text ; location in the sse/2 version, rather than duplicating that code in the ; binary. -INIT_MMX sse -SUBPEL_VARIANCE 4 INIT_XMM sse2 +SUBPEL_VARIANCE 4 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 -INIT_MMX ssse3 -SUBPEL_VARIANCE 4 INIT_XMM ssse3 +SUBPEL_VARIANCE 4 SUBPEL_VARIANCE 8 SUBPEL_VARIANCE 16 -INIT_MMX sse -SUBPEL_VARIANCE 4, 1 INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 SUBPEL_VARIANCE 8, 1 SUBPEL_VARIANCE 16, 1 -INIT_MMX ssse3 -SUBPEL_VARIANCE 4, 1 INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 SUBPEL_VARIANCE 8, 1 SUBPEL_VARIANCE 16, 1 diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 7851a98b1..f8c97117d 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -45,7 +45,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, vpx_get16x16var_avx2, 16); - return *sse - (((unsigned int)sum * sum) >> 8); + return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); } unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm deleted file mode 100644 index f4de42a24..000000000 --- a/vpx_dsp/x86/variance_impl_mmx.asm +++ /dev/null @@ -1,343 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define mmx_filter_shift 7 - -;void vpx_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vpx_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE -sym(vpx_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 ; - - paddw mm2, mm4 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c deleted file mode 100644 index 636231d1c..000000000 --- a/vpx_dsp/x86/variance_mmx.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_dsp_rtcd.h" - -#include "vpx_ports/mem.h" - -DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - -extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const int16_t *HFilter, - const int16_t *VFilter, - int *sum, - unsigned int *sumsquared); - -extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const int16_t *HFilter, - const int16_t *VFilter, - int *sum, - unsigned int *sumsquared); - - -uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int xsum; - unsigned int xxsum; - vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum, &xxsum); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int xsum; - uint32_t xxsum; - vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum, &xxsum); - *sse = xxsum; - return (xxsum - (((uint32_t)xsum * xsum) >> 6)); -} - -uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0); - - vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8)); -} - -uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0); - - vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7)); -} - -uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse) { - int xsum; - unsigned int xxsum; - vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16, - bilinear_filters_mmx[xoffset], - bilinear_filters_mmx[yoffset], - &xsum, &xxsum); - *sse = xxsum; - return (xxsum - (((uint32_t)xsum * xsum) >> 7)); -} - -uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse); -} - -uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse); -} - -uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse); -} diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 43f4603ca..6987c2e24 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -320,11 +320,11 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, int height, unsigned int *sse, \ void *unused0, void *unused) #define DECLS(opt1, opt2) \ - DECL(4, opt2); \ + DECL(4, opt1); \ DECL(8, opt1); \ DECL(16, opt1) -DECLS(sse2, sse); +DECLS(sse2, sse2); DECLS(ssse3, ssse3); #undef DECLS #undef DECL @@ -380,10 +380,10 @@ FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ -FN(4, 8, 4, 2, 3, opt2, (int32_t), (int32_t)); \ -FN(4, 4, 4, 2, 2, opt2, (int32_t), (int32_t)) +FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ +FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) -FNS(sse2, sse); +FNS(sse2, sse2); FNS(ssse3, ssse3); #undef FNS @@ -401,11 +401,11 @@ int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ int height, unsigned int *sse, \ void *unused0, void *unused) #define DECLS(opt1, opt2) \ -DECL(4, opt2); \ +DECL(4, opt1); \ DECL(8, opt1); \ DECL(16, opt1) -DECLS(sse2, sse); +DECLS(sse2, sse2); DECLS(ssse3, ssse3); #undef DECL #undef DECLS @@ -466,8 +466,8 @@ FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ -FN(4, 8, 4, 2, 3, opt2, (uint32_t), (int32_t)); \ -FN(4, 4, 4, 2, 2, opt2, (uint32_t), (int32_t)) +FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ +FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) FNS(sse2, sse); FNS(ssse3, ssse3); diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h index 1f8f914f1..620df31b2 100644 --- a/vpx_ports/mem_ops.h +++ b/vpx_ports/mem_ops.h @@ -89,7 +89,7 @@ static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; - val = mem[0] << 24; + val = ((unsigned MEM_VALUE_T)mem[0]) << 24; val |= mem[1] << 16; val |= mem[2] << 8; val |= mem[3]; @@ -125,7 +125,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { unsigned MEM_VALUE_T val; const MAU_T *mem = (const MAU_T *)vmem; - val = mem[3] << 24; + val = ((unsigned MEM_VALUE_T)mem[3]) << 24; val |= mem[2] << 16; val |= mem[1] << 8; val |= mem[0]; |