-rw-r--r--  test/partial_idct_test.cc              |  20
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl           |   1
-rw-r--r--  vpx_dsp/x86/highbd_idct4x4_add_sse2.c  |  73
-rw-r--r--  vpx_dsp/x86/highbd_idct4x4_add_sse4.c  |  26
-rw-r--r--  vpx_dsp/x86/highbd_idct8x8_add_sse2.c  | 382
-rw-r--r--  vpx_dsp/x86/highbd_idct8x8_add_sse4.c  | 105
-rw-r--r--  vpx_dsp/x86/highbd_inv_txfm_sse2.h     |  94
-rw-r--r--  vpx_dsp/x86/highbd_inv_txfm_sse4.h     |  47
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c            | 822
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.h            | 120
-rw-r--r--  vpx_dsp/x86/txfm_common_sse2.h         |   3
11 files changed, 867 insertions, 826 deletions
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 50a83342f..52bb18fa6 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -717,6 +717,8 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = {
              &wrapper<vpx_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>,
              &wrapper<vpx_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1),
+  make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>,
+             &wrapper<vpx_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>,
              &wrapper<vpx_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1),
   make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>,
@@ -755,6 +757,24 @@ INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
   make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
+      &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 12, 2),
+  make_tuple(
       &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
       &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2),
   make_tuple(
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 358d16914..3773c9069 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -591,7 +591,6 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
     specialize qw/vpx_idct8x8_1_add neon sse2/;
     specialize qw/vpx_idct16x16_256_add neon sse2/;
     specialize qw/vpx_idct16x16_38_add neon sse2/;
-    $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
     specialize qw/vpx_idct16x16_10_add neon sse2/;
     specialize qw/vpx_idct16x16_1_add neon sse2/;
     specialize qw/vpx_idct32x32_1024_add neon sse2/;
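Note: the removed Perl line had aliased the SSE2 38-coefficient 16x16 idct to the full 256-coefficient SSE2 version; with a dedicated vpx_idct16x16_38_add_sse2 now defined in inv_txfm_sse2.c (further below), the specialize line alone binds the new symbol, and the new test tuple exercises that path. As a rough, illustrative sketch of what the generated RTCD setup does with this entry (the exact generated code lives in vpx_dsp_rtcd.h; HAS_SSE2 is the CPU-feature flag from vpx_ports/x86.h):

    /* Sketch of the generated runtime dispatch; not part of this diff. */
    vpx_idct16x16_38_add = vpx_idct16x16_38_add_c;
    if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2;
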
diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
index 505342b1c..9b953dd36 100644
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -8,6 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
@@ -72,73 +74,20 @@ static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
   io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
 }
 
-static INLINE void abs_extend_64bit_sse2(const __m128i in,
-                                         __m128i *const out /*out[2]*/,
-                                         __m128i *const sign /*sign[2]*/) {
-  sign[0] = _mm_srai_epi32(in, 31);
-  out[0] = _mm_xor_si128(in, sign[0]);
-  out[0] = _mm_sub_epi32(out[0], sign[0]);
-  sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]);  // 64-bit sign of 2, 3
-  sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]);  // 64-bit sign of 0, 1
-  out[1] = _mm_unpackhi_epi32(out[0], out[0]);     // 2, 3
-  out[0] = _mm_unpacklo_epi32(out[0], out[0]);     // 0, 1
-}
-
-static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
-                                               const __m128i sign,
-                                               const __m128i cospi) {
-  __m128i out = _mm_mul_epu32(in, cospi);
-  out = _mm_xor_si128(out, sign);
-  return _mm_sub_epi64(out, sign);
-}
-
 static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
-  const __m128i cospi_p16_p16 =
-      _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0);
-  const __m128i cospi_p08_p08 =
-      _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0);
-  const __m128i cospi_p24_p24 =
-      _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0);
-  __m128i temp1[4], temp2[4], step[4], sign1[4], sign2[4];
+  __m128i temp[2], sign[2], step[4];
 
   transpose_32bit_4x4(io, io);
 
   // stage 1
-  temp1[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
-  temp2[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
-  abs_extend_64bit_sse2(temp1[0], temp1, sign1);
-  abs_extend_64bit_sse2(temp2[0], temp2, sign2);
-  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p16_p16);
-  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p16_p16);
-  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p16_p16);
-  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p16_p16);
-  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
-  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
-  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
-  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
-  step[0] = pack_4(temp1[0], temp1[1]);
-  step[1] = pack_4(temp2[0], temp2[1]);
-
-  abs_extend_64bit_sse2(io[1], temp1, sign1);
-  abs_extend_64bit_sse2(io[3], temp2, sign2);
-  temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p08_p08);
-  temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p08_p08);
-  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p24_p24);
-  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p24_p24);
-  temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p24_p24);
-  temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p24_p24);
-  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p08_p08);
-  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p08_p08);
-  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);  // [1]*cospi_24 - [3]*cospi_8
-  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);  // [1]*cospi_24 - [3]*cospi_8
-  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
-  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
-  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
-  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
-  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
-  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
-  step[2] = pack_4(temp1[0], temp1[1]);
-  step[3] = pack_4(temp2[0], temp2[1]);
+  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  abs_extend_64bit_sse2(temp[0], temp, sign);
+  step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  abs_extend_64bit_sse2(temp[0], temp, sign);
+  step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64);
+  highbd_multiplication_and_add_sse2(io[1], io[3], (int)cospi_24_64,
+                                     (int)cospi_8_64, &step[2], &step[3]);
 
   // stage 2
   io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
index cc20df03c..d61cae952 100644
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -8,7 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <smmintrin.h>
+#include <smmintrin.h>  // SSE4.1
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
@@ -17,25 +17,19 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 
 static INLINE void highbd_idct4(__m128i *const io) {
-  const __m128i cospi_p16_p16 =
-      _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0);
-  const __m128i cospi_p08_p08 =
-      _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0);
-  const __m128i cospi_p24_p24 =
-      _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0);
-  __m128i temp1[4], step[4];
+  __m128i temp[2], step[4];
 
   transpose_32bit_4x4(io, io);
 
   // stage 1
-  temp1[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
-  extend_64bit(temp1[0], temp1);
-  step[0] = multiplication_round_shift(temp1, cospi_p16_p16);
-  temp1[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
-  extend_64bit(temp1[0], temp1);
-  step[1] = multiplication_round_shift(temp1, cospi_p16_p16);
-  multiplication_and_add_2_ssse4_1(&io[1], &io[3], &cospi_p24_p24,
-                                   &cospi_p08_p08, &step[2], &step[3]);
+  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  extend_64bit(temp[0], temp);
+  step[0] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  extend_64bit(temp[0], temp);
+  step[1] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64);
+  highbd_multiplication_and_add_sse4_1(io[1], io[3], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step[2], &step[3]);
 
   // stage 2
   io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
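Note: the 4x4 rewrites above route every signed 32x32->64-bit product through the shared helpers. SSE4.1 has a signed multiply (_mm_mul_epi32), but SSE2 only has the unsigned _mm_mul_epu32, so abs_extend_64bit_sse2() and multiply_apply_sign_sse2() multiply absolute values and then restore the sign on the 64-bit product. A minimal scalar model of the SSE2 trick, assuming a non-negative constant as the helpers require (illustrative only, not from the diff):

    #include <stdint.h>

    /* |in| * c with an unsigned multiply, then re-apply in's sign using the
     * two's-complement identity (x ^ s) - s, which negates x when s is all
     * ones and is a no-op when s is zero.  Mirrors multiply_apply_sign_sse2(),
     * with wrapping unsigned arithmetic standing in for _mm_sub_epi32. */
    static int64_t mul_apply_sign(int32_t in, uint32_t c) {
      const uint32_t s = (uint32_t)(in >> 31);       /* 0 or 0xffffffff */
      const uint32_t abs_in = ((uint32_t)in ^ s) - s; /* |in|, wraps like SIMD */
      const uint64_t product = (uint64_t)abs_in * c;  /* unsigned 32x32->64 */
      const int64_t sign64 = (int64_t)(int32_t)s;     /* sign widened to 64 bits */
      return ((int64_t)product ^ sign64) - sign64;    /* negate if in < 0 */
    }
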
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
index 8eae17581..066266b75 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -8,211 +8,219 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static void highbd_idct8x8_half1d(__m128i *const io) {
+  __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
+
+  transpose_32bit_4x4x2(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[2] = io[4];
+  step1[1] = io[2];
+  step1[3] = io[6];
+  highbd_multiplication_and_add_sse2(io[1], io[7], (int)cospi_28_64,
+                                     (int)cospi_4_64, &step1[4], &step1[7]);
+  highbd_multiplication_and_add_sse2(io[5], io[3], (int)cospi_12_64,
+                                     (int)cospi_20_64, &step1[5], &step1[6]);
+
+  // stage 2
+  temp2[0] = _mm_add_epi32(step1[0], step1[2]);
+  abs_extend_64bit_sse2(temp2[0], temp1, sign);
+  step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
+  abs_extend_64bit_sse2(temp2[0], temp1, sign);
+  step2[1] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  highbd_multiplication_and_add_sse2(step1[1], step1[3], (int)cospi_24_64,
+                                     (int)cospi_8_64, &step2[2], &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
+  abs_extend_64bit_sse2(temp2[0], temp1, sign);
+  step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
+  abs_extend_64bit_sse2(temp2[0], temp1, sign);
+  step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  step1[7] = step2[7];
+
+  // stage 4
+  highbd_idct8_stage4(step1, io);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+  __m128i temp1[4], temp2[4], sign[2], step1[8], step2[8];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[1] = io[2];
+  abs_extend_64bit_sse2(io[1], temp1, sign);
+  step1[4] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_28_64);
+  step1[7] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_4_64);
+  abs_extend_64bit_sse2(io[3], temp1, sign);
+  // step1[5] = -step1[5]
+  step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_20_64);
+  step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_12_64);
+
+  // stage 2
+  abs_extend_64bit_sse2(step1[0], temp1, sign);
+  step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  abs_extend_64bit_sse2(step1[1], temp1, sign);
+  step2[2] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_24_64);
+  step2[3] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_8_64);
+  step2[4] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[5] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
+  abs_extend_64bit_sse2(temp2[0], temp1, sign);
+  step1[5] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
+  abs_extend_64bit_sse2(temp2[0], temp1, sign);
+  step1[6] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  step1[7] = step2[7];
+
+  // stage 4
+  highbd_idct8_stage4(step1, io);
+}
 
 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 8; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      transpose_16bit_8x8(inptr, inptr);
-      for (i = 0; i < 8; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[4]);
+    io_short[1] = _mm_packs_epi32(io[1], io[5]);
+    io_short[2] = _mm_packs_epi32(io[2], io[6]);
+    io_short[3] = _mm_packs_epi32(io[3], io[7]);
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    io_short[4] = _mm_packs_epi32(io[8], io[12]);
+    io_short[5] = _mm_packs_epi32(io[9], io[13]);
+    io_short[6] = _mm_packs_epi32(io[10], io[14]);
+    io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+    idct8_sse2(io_short);
+    idct8_sse2(io_short);
+    round_shift_8x8(io_short, io);
   } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 8; ++i) {
-      vpx_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
+    __m128i temp[4];
+
+    highbd_idct8x8_half1d(io);
+
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    highbd_idct8x8_half1d(&io[8]);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    io[4] = io[8];
+    io[5] = io[9];
+    io[6] = io[10];
+    io[7] = io[11];
+    highbd_idct8x8_half1d(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_half1d(&io[8]);
+
+    highbd_idct8x8_final_round(io);
   }
 
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = add_clamp(d[i], inptr[i], bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      vpx_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
+  recon_and_store_8(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
+  const __m128i zero = _mm_setzero_si128();
+  __m128i io[16];
 
-  // Find the min & max for the row transform
-  // only first 4 row has non-zero coefs
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    // N.B. Only first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use fact only first 4 rows contain non-zero coeffs
-      transpose_16bit_4x8(inptr, inptr);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set to use the optimised transform for the column
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      vpx_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], zero);
+    io_short[1] = _mm_packs_epi32(io[1], zero);
+    io_short[2] = _mm_packs_epi32(io[2], zero);
+    io_short[3] = _mm_packs_epi32(io[3], zero);
 
-  if (optimised_cols) {
-    idct8_sse2(inptr);
-
-    // Final round & shift and Reconstruction and Store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = add_clamp(d[i], inptr[i], bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
+    idct8x8_12_add_kernel_sse2(io_short);
+    round_shift_8x8(io_short, io);
   } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      vpx_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
+    __m128i temp[4];
+
+    highbd_idct8x8_12_half1d(io);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    highbd_idct8x8_12_half1d(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_12_half1d(&io[8]);
+
+    highbd_idct8x8_final_round(io);
   }
+
+  recon_and_store_8(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
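Note: the old SSE2 8x8 code packed the high-bitdepth coefficients to 16 bits up front and range-checked them against +/-6201, falling back to vpx_highbd_idct8_c() whenever a pass might overflow 16 bits. The rewrite keeps a genuine 32-bit SSE2 path (highbd_idct8x8_half1d(), run twice per dimension on 4-column halves) for 10/12-bit data and only packs to 16 bits when bd == 8, where the 16-bit idct8_sse2() cannot overflow. Both branches share the same final rounding and reconstruction; a scalar model of that last step (illustrative, not from the diff):

    #include <stdint.h>

    /* Model of highbd_idct8x8_final_round() + recon_and_store_8() for one
     * coefficient: round by (x + 16) >> 5, add the residual to the
     * prediction, and clamp to [0, 2^bd - 1]. */
    static uint16_t highbd_recon_pixel(uint16_t pred, int32_t coeff, int bd) {
      const int32_t residual = (coeff + 16) >> 5;
      const int32_t max = (1 << bd) - 1;
      int32_t pixel = (int32_t)pred + residual;
      if (pixel < 0) pixel = 0;
      if (pixel > max) pixel = max;
      return (uint16_t)pixel;
    }
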
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
index 4513c0fb5..80eedec5e 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -8,7 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <smmintrin.h>
+#include <smmintrin.h>  // SSE4.1
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
@@ -18,20 +18,6 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 
 static void highbd_idct8x8_half1d(__m128i *const io) {
-  const __m128i cp_4q_4q =
-      _mm_setr_epi32((int)cospi_4_64 << 2, 0, (int)cospi_4_64 << 2, 0);
-  const __m128i cp_8q_8q =
-      _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0);
-  const __m128i cp_12q_12q =
-      _mm_setr_epi32((int)cospi_12_64 << 2, 0, (int)cospi_12_64 << 2, 0);
-  const __m128i cp_16q_16q =
-      _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0);
-  const __m128i cp_20q_20q =
-      _mm_setr_epi32((int)cospi_20_64 << 2, 0, (int)cospi_20_64 << 2, 0);
-  const __m128i cp_24q_24q =
-      _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0);
-  const __m128i cp_28q_28q =
-      _mm_setr_epi32((int)cospi_28_64 << 2, 0, (int)cospi_28_64 << 2, 0);
   __m128i temp1[4], temp2[4], step1[8], step2[8];
 
   transpose_32bit_4x4x2(io, io);
@@ -41,20 +27,20 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
   step1[2] = io[4];
   step1[1] = io[2];
   step1[3] = io[6];
-  multiplication_and_add_2_ssse4_1(&io[1], &io[7], &cp_28q_28q, &cp_4q_4q,
-                                   &step1[4], &step1[7]);
-  multiplication_and_add_2_ssse4_1(&io[5], &io[3], &cp_12q_12q, &cp_20q_20q,
-                                   &step1[5], &step1[6]);
+  highbd_multiplication_and_add_sse4_1(io[1], io[7], (int)cospi_28_64,
+                                       (int)cospi_4_64, &step1[4], &step1[7]);
+  highbd_multiplication_and_add_sse4_1(io[5], io[3], (int)cospi_12_64,
+                                       (int)cospi_20_64, &step1[5], &step1[6]);
 
   // stage 2
   temp2[0] = _mm_add_epi32(step1[0], step1[2]);
   extend_64bit(temp2[0], temp1);
-  step2[0] = multiplication_round_shift(temp1, cp_16q_16q);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
   extend_64bit(temp2[0], temp1);
-  step2[1] = multiplication_round_shift(temp1, cp_16q_16q);
-  multiplication_and_add_2_ssse4_1(&step1[1], &step1[3], &cp_24q_24q, &cp_8q_8q,
-                                   &step2[2], &step2[3]);
+  step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_multiplication_and_add_sse4_1(step1[1], step1[3], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[2], &step2[3]);
   step2[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -68,38 +54,17 @@ static void highbd_idct8x8_half1d(__m128i *const io) {
   step1[4] = step2[4];
   temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
   extend_64bit(temp2[0], temp1);
-  step1[5] = multiplication_round_shift(temp1, cp_16q_16q);
+  step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   temp2[0] = _mm_add_epi32(step2[6], step2[5]);
   extend_64bit(temp2[0], temp1);
-  step1[6] = multiplication_round_shift(temp1, cp_16q_16q);
+  step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   step1[7] = step2[7];
 
   // stage 4
-  io[0] = _mm_add_epi32(step1[0], step1[7]);
-  io[1] = _mm_add_epi32(step1[1], step1[6]);
-  io[2] = _mm_add_epi32(step1[2], step1[5]);
-  io[3] = _mm_add_epi32(step1[3], step1[4]);
-  io[4] = _mm_sub_epi32(step1[3], step1[4]);
-  io[5] = _mm_sub_epi32(step1[2], step1[5]);
-  io[6] = _mm_sub_epi32(step1[1], step1[6]);
-  io[7] = _mm_sub_epi32(step1[0], step1[7]);
+  highbd_idct8_stage4(step1, io);
 }
 
 static void highbd_idct8x8_12_half1d(__m128i *const io) {
-  const __m128i cp_28q_28q =
-      _mm_setr_epi32((int)cospi_28_64 << 2, 0, (int)cospi_28_64 << 2, 0);
-  const __m128i cp_4q_4q =
-      _mm_setr_epi32((int)cospi_4_64 << 2, 0, (int)cospi_4_64 << 2, 0);
-  const __m128i cp_n20q_n20q =
-      _mm_setr_epi32(-(int)cospi_20_64 * 4, 0, -(int)cospi_20_64 * 4, 0);
-  const __m128i cp_12q_12q =
-      _mm_setr_epi32((int)cospi_12_64 << 2, 0, (int)cospi_12_64 << 2, 0);
-  const __m128i cp_16q_16q =
-      _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0);
-  const __m128i cp_8q_8q =
-      _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0);
-  const __m128i cp_24q_24q =
-      _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0);
   __m128i temp1[4], temp2[4], step1[8], step2[8];
 
   transpose_32bit_4x4(io, io);
@@ -108,18 +73,18 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) {
   step1[0] = io[0];
   step1[1] = io[2];
   extend_64bit(io[1], temp1);
-  step1[4] = multiplication_round_shift(temp1, cp_28q_28q);
-  step1[7] = multiplication_round_shift(temp1, cp_4q_4q);
+  step1[4] = multiplication_round_shift_sse4_1(temp1, (int)cospi_28_64);
+  step1[7] = multiplication_round_shift_sse4_1(temp1, (int)cospi_4_64);
   extend_64bit(io[3], temp1);
-  step1[5] = multiplication_round_shift(temp1, cp_n20q_n20q);
-  step1[6] = multiplication_round_shift(temp1, cp_12q_12q);
+  step1[5] = multiplication_round_shift_sse4_1(temp1, -(int)cospi_20_64);
+  step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_12_64);
 
   // stage 2
   extend_64bit(step1[0], temp1);
-  step2[0] = multiplication_round_shift(temp1, cp_16q_16q);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   extend_64bit(step1[1], temp1);
-  step2[2] = multiplication_round_shift(temp1, cp_24q_24q);
-  step2[3] = multiplication_round_shift(temp1, cp_8q_8q);
+  step2[2] = multiplication_round_shift_sse4_1(temp1, (int)cospi_24_64);
+  step2[3] = multiplication_round_shift_sse4_1(temp1, (int)cospi_8_64);
   step2[4] = _mm_add_epi32(step1[4], step1[5]);
   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
@@ -133,21 +98,14 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) {
   step1[4] = step2[4];
   temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
   extend_64bit(temp2[0], temp1);
-  step1[5] = multiplication_round_shift(temp1, cp_16q_16q);
+  step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   temp2[0] = _mm_add_epi32(step2[6], step2[5]);
   extend_64bit(temp2[0], temp1);
-  step1[6] = multiplication_round_shift(temp1, cp_16q_16q);
+  step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   step1[7] = step2[7];
 
   // stage 4
-  io[0] = _mm_add_epi32(step1[0], step1[7]);
-  io[1] = _mm_add_epi32(step1[1], step1[6]);
-  io[2] = _mm_add_epi32(step1[2], step1[5]);
-  io[3] = _mm_add_epi32(step1[3], step1[4]);
-  io[4] = _mm_sub_epi32(step1[3], step1[4]);
-  io[5] = _mm_sub_epi32(step1[2], step1[5]);
-  io[6] = _mm_sub_epi32(step1[1], step1[6]);
-  io[7] = _mm_sub_epi32(step1[0], step1[7]);
+  highbd_idct8_stage4(step1, io);
 }
 
 void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
@@ -210,20 +168,14 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
   io[6] = io[10];
   io[7] = io[11];
   highbd_idct8x8_half1d(io);
+
   io[8] = temp[0];
   io[9] = temp[1];
   io[10] = temp[2];
   io[11] = temp[3];
   highbd_idct8x8_half1d(&io[8]);
 
-    io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
-    io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
-    io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
-    io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
-    io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
-    io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
-    io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
-    io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+    highbd_idct8x8_final_round(io);
   }
 
   recon_and_store_8(io, dest, stride, bd);
@@ -266,14 +218,7 @@ void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
   io[11] = temp[3];
   highbd_idct8x8_12_half1d(&io[8]);
 
-    io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
-    io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
-    io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
-    io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
-    io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
-    io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
-    io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
-    io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+    highbd_idct8x8_final_round(io);
   }
 
   recon_and_store_8(io, dest, stride, bd);
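Note: both the SSE2 and SSE4.1 kernels now take plain int cosine constants and build pair_set_epi32(c << 2, 0) internally. That pre-scale by 4 pairs with dct_const_round_shift_64bit(), which adds DCT_CONST_ROUNDING << 2 and shifts the register right by two bytes (16 bits), so the net effect is the canonical 14-bit DCT rounding shift. A scalar identity check, illustrative only:

    #include <assert.h>
    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* 4*(x + r) >> 16 == (x + r) >> 14 for any 64-bit x, so multiplying by
     * (c << 2) and shifting by 16 matches the plain 14-bit rounding shift. */
    static void check_round_shift(int64_t in_times_c) {
      const int64_t scaled =
          (4 * in_times_c + ((int64_t)DCT_CONST_ROUNDING << 2)) >> 16;
      const int64_t plain = (in_times_c + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
      assert(scaled == plain);
    }
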
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index 2d1f39376..7a303e2ca 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -12,6 +12,7 @@
 #define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
 
 #include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/inv_txfm.h"
@@ -44,9 +45,8 @@ static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
 }
 
 static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
-  const __m128i t = _mm_add_epi64(
-      in,
-      _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));
+  const __m128i t =
+      _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0));
   return _mm_srli_si128(t, 2);
 }
 
@@ -56,6 +56,94 @@ static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
   return _mm_unpacklo_epi32(t0, t1);  // 0, 1, 2, 3
 }
 
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+                                         __m128i *const out /*out[2]*/,
+                                         __m128i *const sign /*sign[2]*/) {
+  sign[0] = _mm_srai_epi32(in, 31);
+  out[0] = _mm_xor_si128(in, sign[0]);
+  out[0] = _mm_sub_epi32(out[0], sign[0]);
+  sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]);  // 64-bit sign of 2, 3
+  sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]);  // 64-bit sign of 0, 1
+  out[1] = _mm_unpackhi_epi32(out[0], out[0]);     // 2, 3
+  out[0] = _mm_unpacklo_epi32(out[0], out[0]);     // 0, 1
+}
+
+// Note: cospi must be non negative.
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+                                               const __m128i sign,
+                                               const __m128i cospi) {
+  __m128i out = _mm_mul_epu32(in, cospi);
+  out = _mm_xor_si128(out, sign);
+  return _mm_sub_epi64(out, sign);
+}
+
+// Note: c must be non negative.
+static INLINE __m128i multiplication_round_shift_sse2(
+    const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/,
+    const int c) {
+  const __m128i pair_c = pair_set_epi32(c << 2, 0);
+  __m128i t0, t1;
+
+  t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
+  t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
+  t0 = dct_const_round_shift_64bit(t0);
+  t1 = dct_const_round_shift_64bit(t1);
+
+  return pack_4(t0, t1);
+}
+
+// Note: c0 and c1 must be non negative.
+static INLINE void highbd_multiplication_and_add_sse2(
+    const __m128i in0, const __m128i in1, const int c0, const int c1,
+    __m128i *const out0, __m128i *const out1) {
+  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
+  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+  __m128i temp1[4], temp2[4], sign1[4], sign2[4];
+
+  abs_extend_64bit_sse2(in0, temp1, sign1);
+  abs_extend_64bit_sse2(in1, temp2, sign2);
+  temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
+  temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1);
+  temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0);
+  temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0);
+  temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0);
+  temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0);
+  temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1);
+  temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1);
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  *out0 = pack_4(temp1[0], temp1[1]);
+  *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+static INLINE void highbd_idct8_stage4(const __m128i *const in,
+                                       __m128i *const out) {
+  out[0] = _mm_add_epi32(in[0], in[7]);
+  out[1] = _mm_add_epi32(in[1], in[6]);
+  out[2] = _mm_add_epi32(in[2], in[5]);
+  out[3] = _mm_add_epi32(in[3], in[4]);
+  out[4] = _mm_sub_epi32(in[3], in[4]);
+  out[5] = _mm_sub_epi32(in[2], in[5]);
+  out[6] = _mm_sub_epi32(in[1], in[6]);
+  out[7] = _mm_sub_epi32(in[0], in[7]);
+}
+
+static INLINE void highbd_idct8x8_final_round(__m128i *const io) {
+  io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+  io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+  io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+  io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+  io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+  io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+  io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+  io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+}
+
 static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
                                 const int bd) {
   const __m128i zero = _mm_set1_epi16(0);
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index 72d3d5327..170f641d3 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -14,37 +14,38 @@
 #include <smmintrin.h>  // SSE4.1
 
 #include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/inv_txfm.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
 
-static INLINE __m128i multiplication_round_shift(const __m128i *const in,
-                                                 const __m128i cospi) {
+static INLINE __m128i multiplication_round_shift_sse4_1(
+    const __m128i *const in /*in[2]*/, const int c) {
+  const __m128i pair_c = pair_set_epi32(c << 2, 0);
   __m128i t0, t1;
-  t0 = _mm_mul_epi32(in[0], cospi);
-  t1 = _mm_mul_epi32(in[1], cospi);
+
+  t0 = _mm_mul_epi32(in[0], pair_c);
+  t1 = _mm_mul_epi32(in[1], pair_c);
   t0 = dct_const_round_shift_64bit(t0);
   t1 = dct_const_round_shift_64bit(t1);
+
   return pack_4(t0, t1);
 }
 
-static INLINE void multiplication_and_add_2_ssse4_1(const __m128i *const in0,
-                                                    const __m128i *const in1,
-                                                    const __m128i *const cst0,
-                                                    const __m128i *const cst1,
-                                                    __m128i *const out0,
-                                                    __m128i *const out1) {
+static INLINE void highbd_multiplication_and_add_sse4_1(
+    const __m128i in0, const __m128i in1, const int c0, const int c1,
+    __m128i *const out0, __m128i *const out1) {
+  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
+  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
   __m128i temp1[4], temp2[4];
-  extend_64bit(*in0, temp1);
-  extend_64bit(*in1, temp2);
-  temp1[2] = _mm_mul_epi32(temp1[0], *cst1);
-  temp1[3] = _mm_mul_epi32(temp1[1], *cst1);
-  temp1[0] = _mm_mul_epi32(temp1[0], *cst0);
-  temp1[1] = _mm_mul_epi32(temp1[1], *cst0);
-  temp2[2] = _mm_mul_epi32(temp2[0], *cst0);
-  temp2[3] = _mm_mul_epi32(temp2[1], *cst0);
-  temp2[0] = _mm_mul_epi32(temp2[0], *cst1);
-  temp2[1] = _mm_mul_epi32(temp2[1], *cst1);
+
+  extend_64bit(in0, temp1);
+  extend_64bit(in1, temp2);
+  temp1[2] = _mm_mul_epi32(temp1[0], pair_c1);
+  temp1[3] = _mm_mul_epi32(temp1[1], pair_c1);
+  temp1[0] = _mm_mul_epi32(temp1[0], pair_c0);
+  temp1[1] = _mm_mul_epi32(temp1[1], pair_c0);
+  temp2[2] = _mm_mul_epi32(temp2[0], pair_c0);
+  temp2[3] = _mm_mul_epi32(temp2[1], pair_c0);
+  temp2[0] = _mm_mul_epi32(temp2[0], pair_c1);
+  temp2[1] = _mm_mul_epi32(temp2[1], pair_c1);
   temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
   temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
   temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
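Note: the helpers promoted into these headers give the 4x4 and 8x8 translation units one shared high-bitdepth butterfly. highbd_multiplication_and_add_sse2/_sse4_1 both compute the standard planar rotation out0 = in0*c0 - in1*c1, out1 = in0*c1 + in1*c0 with 64-bit intermediates; a scalar reference (illustrative only):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* Scalar reference for the rotation implemented by the SSE2 and SSE4.1
     * highbd_multiplication_and_add helpers above. */
    static void highbd_butterfly(int32_t in0, int32_t in1, int c0, int c1,
                                 int32_t *out0, int32_t *out1) {
      const int64_t t0 = (int64_t)in0 * c0 - (int64_t)in1 * c1;
      const int64_t t1 = (int64_t)in0 * c1 + (int64_t)in1 * c0;
      *out0 = (int32_t)((t0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
      *out1 = (int32_t)((t1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }
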
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index a188f8337..56f9e660f 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -8,6 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
@@ -146,72 +148,6 @@ void iadst4_sse2(__m128i *in) {
   in[1] = _mm_packs_epi32(u[2], u[3]);
 }
 
-// Multiply elements by constants and add them together.
-static INLINE void multiplication_and_add(
-    const __m128i *const in0, const __m128i *const in1,
-    const __m128i *const in2, const __m128i *const in3,
-    const __m128i *const cst0, const __m128i *const cst1,
-    const __m128i *const cst2, const __m128i *const cst3, __m128i *const res0,
-    __m128i *const res1, __m128i *const res2, __m128i *const res3) {
-  const __m128i lo_0 = _mm_unpacklo_epi16(*in0, *in1);
-  const __m128i hi_0 = _mm_unpackhi_epi16(*in0, *in1);
-  const __m128i lo_1 = _mm_unpacklo_epi16(*in2, *in3);
-  const __m128i hi_1 = _mm_unpackhi_epi16(*in2, *in3);
-  *res0 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst0);
-  *res1 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst1);
-  *res2 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst2);
-  *res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3);
-}
-
-static INLINE void idct8(const __m128i *const in, __m128i *const out) {
-  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  __m128i step1[8], step2[8];
-
-  // stage 1
-  {
-    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28,
-                           &cp_n20_12, &cp_12_20, &step1[4], &step1[7],
-                           &step1[5], &step1[6]);
-  }
-
-  // stage 2
-  {
-    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16,
-                           &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0],
-                           &step2[1], &step2[2], &step2[3]);
-  }
-
-  step2[4] = _mm_add_epi16(step1[4], step1[5]);
-  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
-  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
-  step2[7] = _mm_add_epi16(step1[7], step1[6]);
-
-  // stage 3
-  step1[0] = _mm_add_epi16(step2[0], step2[3]);
-  step1[1] = _mm_add_epi16(step2[1], step2[2]);
-  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
-  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
-  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
-                           &step1[5], &step1[6]);
-
-  // stage 4
-  out[0] = _mm_add_epi16(step1[0], step2[7]);
-  out[1] = _mm_add_epi16(step1[1], step1[6]);
-  out[2] = _mm_add_epi16(step1[2], step1[5]);
-  out[3] = _mm_add_epi16(step1[3], step2[4]);
-  out[4] = _mm_sub_epi16(step1[3], step2[4]);
-  out[5] = _mm_sub_epi16(step1[2], step1[5]);
-  out[6] = _mm_sub_epi16(step1[1], step1[6]);
-  out[7] = _mm_sub_epi16(step1[0], step2[7]);
-}
-
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
   __m128i in[8];
@@ -228,6 +164,19 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
   write_buffer_8x8(in, dest, stride);
 }
 
+void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  __m128i io[8];
+
+  io[0] = load_input_data4(input + 0 * 8);
+  io[1] = load_input_data4(input + 1 * 8);
+  io[2] = load_input_data4(input + 2 * 8);
+  io[3] = load_input_data4(input + 3 * 8);
+
+  idct8x8_12_add_kernel_sse2(io);
+  write_buffer_8x8(io, dest, stride);
+}
+
 static INLINE void recon_and_store_8_dual(uint8_t *const dest,
                                           const __m128i in_x,
                                           const int stride) {
@@ -473,127 +422,10 @@ void iadst8_sse2(__m128i *in) {
   in[7] = _mm_sub_epi16(k__const_0, s1);
 }
 
-void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  __m128i in[8], step1[8], step2[8], tmp[4];
-
-  in[0] = load_input_data4(input + 0 * 8);
-  in[1] = load_input_data4(input + 1 * 8);
-  in[2] = load_input_data4(input + 2 * 8);
-  in[3] = load_input_data4(input + 3 * 8);
-
-  transpose_16bit_4x4(in, in);
-  // in[0]: 00 10 20 30  01 11 21 31
-  // in[1]: 02 12 22 32  03 13 23 33
-
-  // stage 1
-  {
-    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-    const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero);
-    step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1);    // step1 4&7
-    step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3);  // step1 5&6
-  }
-
-  // stage 2
-  {
-    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero);
-    step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0);  // step2 0&1
-    step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2);    // step2 3&2
-    step2[4] = _mm_add_epi16(step1[4], step1[5]);                  // step2 4&7
-    step2[5] = _mm_sub_epi16(step1[4], step1[5]);                  // step2 5&6
-    step2[6] = _mm_unpackhi_epi64(step2[5], zero);                 // step2 6
-  }
-
-  // stage 3
-  {
-    const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
-    tmp[0] = _mm_add_epi16(step2[0], step2[2]);     // step1 0&1
-    tmp[1] = _mm_sub_epi16(step2[0], step2[2]);     // step1 3&2
-    step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);  // step1 2&1
-    step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);  // step1 3&0
-    step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65);  // step1 5&6
-  }
-
-  // stage 4
-  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
-  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
-  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
-  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
-
-  idct8x8_12_transpose_16bit_4x8(tmp, in);
-  in[4] = in[5] = in[6] = in[7] = zero;
-
-  idct8(in, in);
-  write_buffer_8x8(in, dest, stride);
-}
-
-#define IDCT16_10 \
-  /* Stage2 */ \
-  multiplication_and_add(&in[1], &zero, &zero, &in[3], &stg2_0, &stg2_1, \
-                         &stg2_6, &stg2_7, &stp1_8_0, &stp1_15, &stp1_11, \
-                         &stp1_12_0); \
-  \
-  /* Stage3 */ \
-  multiplication_and_add_2(&in[2], &zero, &stg3_0, &stg3_1, &stp2_4, &stp2_7); \
-  \
-  stp1_9 = stp1_8_0; \
-  stp1_10 = stp1_11; \
-  stp1_13 = stp1_12_0; \
-  stp1_14 = stp1_15; \
-  \
-  /* Stage4 */ \
-  multiplication_and_add_2(&in[0], &zero, &stg4_0, &stg4_1, &stp1_0, &stp1_1); \
-  stp2_5 = stp2_4; \
-  stp2_6 = stp2_7; \
-  \
-  multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \
-                         &stg4_5, &stg4_6, &stg4_7, &stp2_9, &stp2_14, \
-                         &stp2_10, &stp2_13); \
-  \
-  /* Stage5 */ \
-  stp1_2 = stp1_1; \
-  stp1_3 = stp1_0; \
-  multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \
-                           &stp1_6); \
-  \
-  stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
-  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
-  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
-  stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
-  \
-  stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
-  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
-  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
-  stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
-  \
-  /* Stage6 */ \
-  stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
-  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
-  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
-  stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
-  stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
-  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
-  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
-  stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
-  \
-  multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \
-                         &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \
-                         &stp2_11, &stp2_12);
-
-static INLINE void idct16_8col(__m128i *const in) {
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+static INLINE void idct16_8col(__m128i *const io /*io[16]*/) {
+  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i s[16], t[16];
+  __m128i step1[16], step2[16];
 
   // stage 2
   {
@@ -601,18 +433,20 @@ static INLINE void idct16_8col(__m128i *const in) {
    const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
     const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
     const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
     const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-    multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &k__cospi_p30_m02,
+    multiplication_and_add(&io[1], &io[15], &io[9], &io[7], &k__cospi_p30_m02,
                            &k__cospi_p02_p30, &k__cospi_p14_m18,
-                           &k__cospi_p18_p14, &s[8], &s[15], &s[9], &s[14]);
+                           &k__cospi_p18_p14, &step2[8], &step2[15], &step2[9],
+                           &step2[14]);
   }
   {
     const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
     const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
     const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
     const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-    multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &k__cospi_p22_m10,
+    multiplication_and_add(&io[5], &io[11], &io[13], &io[3], &k__cospi_p22_m10,
                            &k__cospi_p10_p22, &k__cospi_p06_m26,
-                           &k__cospi_p26_p06, &s[10], &s[13], &s[11], &s[12]);
+                           &k__cospi_p26_p06, &step2[10], &step2[13],
+                           &step2[11], &step2[12]);
   }
 
   // stage 3
   {
@@ -621,103 +455,110 @@ static INLINE void idct16_8col(__m128i *const in) {
    const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
     const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
     const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
     const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-    multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &k__cospi_p28_m04,
+    multiplication_and_add(&io[2], &io[14], &io[10], &io[6], &k__cospi_p28_m04,
                            &k__cospi_p04_p28, &k__cospi_p12_m20,
-                           &k__cospi_p20_p12, &t[4], &t[7], &t[5], &t[6]);
+                           &k__cospi_p20_p12, &step1[4], &step1[7], &step1[5],
+                           &step1[6]);
   }
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
+  step1[8] = _mm_add_epi16(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi16(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi16(step2[11], step2[10]);
+  step1[11] = _mm_add_epi16(step2[10], step2[11]);
+  step1[12] = _mm_add_epi16(step2[12], step2[13]);
+  step1[13] = _mm_sub_epi16(step2[12], step2[13]);
+  step1[14] = _mm_sub_epi16(step2[15], step2[14]);
+  step1[15] = _mm_add_epi16(step2[14], step2[15]);
 
   // stage 4
   {
-    const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-    const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-    const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-    multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &k__cospi_p16_p16,
-                           &k__cospi_p16_m16, &k__cospi_p24_m08,
-                           &k__cospi_p08_p24, &s[0], &s[1], &s[2], &s[3]);
-  }
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  t[4] = _mm_add_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  t[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  {
     const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
     const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
     const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-    multiplication_and_add(&t[9], &t[14], &t[10], &t[13], &k__cospi_m08_p24,
-                           &k__cospi_p24_p08, &k__cospi_m24_m08,
-                           &k__cospi_m08_p24, &s[9], &s[14], &s[10], &s[13]);
+    multiplication_and_add(&io[8], &io[0], &io[12], &io[4], &k__cospi_p16_p16,
+                           &k__cospi_m16_p16, &k__cospi_m08_p24,
+                           &k__cospi_p24_p08, &step2[0], &step2[1], &step2[2],
+                           &step2[3]);
+    step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+    step1[4] = _mm_add_epi16(step1[4], step1[5]);
+    step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+    step1[7] = _mm_add_epi16(step1[6], step1[7]);
+    step2[8] = step1[8];
+    multiplication_and_add(&step1[9], &step1[14], &step1[10], &step1[13],
+                           &k__cospi_m08_p24, &k__cospi_p24_p08,
+                           &k__cospi_m24_m08, &k__cospi_m08_p24, &step2[9],
+                           &step2[14], &step2[10], &step2[13]);
   }
-  s[11] = t[11];
-  s[12] = t[12];
-  s[15] = t[15];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
 
   // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16,
-                           &t[5], &t[6]);
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  multiplication_and_add_2(&step2[5], &step2[6], &k__cospi_m16_p16,
+                           &k__cospi_p16_p16, &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi16(step2[8], step2[11]);
+  step1[9] = _mm_add_epi16(step2[9], step2[10]);
+  step1[10] = _mm_sub_epi16(step2[9], step2[10]);
+  step1[11] = _mm_sub_epi16(step2[8], step2[11]);
+  step1[12] = _mm_sub_epi16(step2[15], step2[12]);
+  step1[13] = _mm_sub_epi16(step2[14], step2[13]);
+  step1[14] = _mm_add_epi16(step2[14], step2[13]);
+  step1[15] = _mm_add_epi16(step2[15], step2[12]);
 
   // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  multiplication_and_add(&t[10], &t[13], &t[11], &t[12], &k__cospi_m16_p16,
-                         &k__cospi_p16_p16, &k__cospi_m16_p16,
-                         &k__cospi_p16_p16, &s[10], &s[13], &s[11], &s[12]);
+  step2[0] = _mm_add_epi16(step1[0], step1[7]);
+  step2[1] = _mm_add_epi16(step1[1], step1[6]);
+  step2[2] = _mm_add_epi16(step1[2], step1[5]);
+  step2[3] = _mm_add_epi16(step1[3], step1[4]);
+  step2[4] = _mm_sub_epi16(step1[3], step1[4]);
+  step2[5] = _mm_sub_epi16(step1[2], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[1], step1[6]);
+  step2[7] = _mm_sub_epi16(step1[0], step1[7]);
+  multiplication_and_add(&step1[10], &step1[13], &step1[11], &step1[12],
+                         &k__cospi_m16_p16, &k__cospi_p16_p16,
+                         &k__cospi_m16_p16, &k__cospi_p16_p16, &step2[10],
+                         &step2[13], &step2[11], &step2[12]);
 
   // stage 7
-  in[0] = _mm_add_epi16(s[0], t[15]);
-  in[1] = _mm_add_epi16(s[1], t[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], t[9]);
-  in[7] = _mm_add_epi16(s[7], t[8]);
-  in[8] = _mm_sub_epi16(s[7], t[8]);
-  in[9] = _mm_sub_epi16(s[6], t[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], t[14]);
-  in[15] = _mm_sub_epi16(s[0], t[15]);
+  io[0] = _mm_add_epi16(step2[0], step1[15]);
+  io[1] = _mm_add_epi16(step2[1], step1[14]);
+  io[2] = _mm_add_epi16(step2[2], step2[13]);
+  io[3] = _mm_add_epi16(step2[3], step2[12]);
+  io[4] = _mm_add_epi16(step2[4], step2[11]);
+  io[5] = _mm_add_epi16(step2[5], step2[10]);
+  io[6] = _mm_add_epi16(step2[6], step1[9]);
+  io[7] = _mm_add_epi16(step2[7], step1[8]);
+  io[8] = _mm_sub_epi16(step2[7], step1[8]);
+  io[9] = _mm_sub_epi16(step2[6], step1[9]);
+  io[10] = _mm_sub_epi16(step2[5], step2[10]);
+  io[11] = _mm_sub_epi16(step2[4], step2[11]);
+  io[12] = _mm_sub_epi16(step2[3], step2[12]);
+  io[13] = _mm_sub_epi16(step2[2], step2[13]);
+  io[14] = _mm_sub_epi16(step2[1], step1[14]);
+  io[15] = _mm_sub_epi16(step2[0], step1[15]);
 }
 
 static INLINE void idct16_load8x8(const tran_low_t *const input,
                                   __m128i *const in) {
-  in[0] = load_input_data8(input);
-  in[1] = load_input_data8(input + 8 * 2);
-  in[2] = load_input_data8(input + 8 * 4);
-  in[3] = load_input_data8(input + 8 * 6);
-  in[4] = load_input_data8(input + 8 * 8);
-  in[5] = load_input_data8(input + 8 * 10);
-  in[6] = load_input_data8(input + 8 * 12);
-  in[7] = load_input_data8(input + 8 * 14);
+  in[0] = load_input_data8(input + 0 * 16);
+  in[1] = load_input_data8(input + 1 * 16);
+  in[2] = load_input_data8(input + 2 * 16);
+  in[3] = load_input_data8(input + 3 * 16);
+  in[4] = load_input_data8(input + 4 * 16);
+  in[5] = load_input_data8(input + 5 * 16);
+  in[6] = load_input_data8(input + 6 * 16);
+  in[7] = load_input_data8(input + 7 * 16);
+}
+
+static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  __m128i out;
+  out = _mm_adds_epi16(in, final_rounding);
+  out = _mm_srai_epi16(out, 6);
+  recon_and_store(dest, out);
 }
 
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -742,12 +583,265 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
     transpose_16bit_8x8(r + i * 8, out + 8);
     idct16_8col(out);
 
-    // Final rounding and shift
     for (j = 0; j < 16; ++j) {
-      const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-      out[j] = _mm_adds_epi16(out[j], final_rounding);
-      out[j] = _mm_srai_epi16(out[j], 6);
-      recon_and_store(dest + j * stride, out[j]);
+      write_buffer_8x1(dest + j * stride, out[j]);
     }
 
+    dest += 8;
+  }
+}
+
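Note: the vpx_idct16x16_38_add_sse2() added next no longer routes through the full 256-coefficient transform. The "38" variant assumes at most the first 38 coefficients in scan order are nonzero, all of which fall in the top-left 8x8 of the 16x16 block, so only eight rows are loaded and the upper half of each pass is zeroed before idct16_8col(). The per-row store factored out as write_buffer_8x1() rounds by (x + 32) >> 6 and adds to the prediction; a scalar model (illustrative only):

    #include <stdint.h>

    /* Model of write_buffer_8x1() for one pixel: round the column-pass
     * output by (x + 32) >> 6, then add it to the 8-bit prediction with
     * saturation, as recon_and_store() does. */
    static uint8_t idct16_recon_pixel(uint8_t pred, int16_t coeff) {
      int pixel = pred + ((coeff + 32) >> 6);
      if (pixel < 0) pixel = 0;
      if (pixel > 255) pixel = 255;
      return (uint8_t)pixel;
    }
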
vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], out[16]; + int i; + + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + in[8] = _mm_setzero_si128(); + in[9] = _mm_setzero_si128(); + in[10] = _mm_setzero_si128(); + in[11] = _mm_setzero_si128(); + in[12] = _mm_setzero_si128(); + in[13] = _mm_setzero_si128(); + in[14] = _mm_setzero_si128(); + in[15] = _mm_setzero_si128(); + idct16_8col(in); + + for (i = 0; i < 2; i++) { + int j; + transpose_16bit_8x8(in + i * 8, out); + out[8] = _mm_setzero_si128(); + out[9] = _mm_setzero_si128(); + out[10] = _mm_setzero_si128(); + out[11] = _mm_setzero_si128(); + out[12] = _mm_setzero_si128(); + out[13] = _mm_setzero_si128(); + out[14] = _mm_setzero_si128(); + out[15] = _mm_setzero_si128(); + idct16_8col(out); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } + + dest += 8; + } +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); 
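+    // Each 128-bit register here packs two 4-sample halves; the
+    // _mm_unpackhi_epi64 calls below peel step1[12] through step1[15] out of
+    // the upper halves of step1[11] through step1[8].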
+ step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. + output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + multiplication_and_add(&io[1], &zero, &zero, &io[3], &k__cospi_p30_m02, + &k__cospi_p02_p30, &k__cospi_p06_m26, + &k__cospi_p26_p06, &step2[8], &step2[15], &step2[11], + &step2[12]); + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + multiplication_and_add_2(&io[2], &zero, &k__cospi_p28_m04, + &k__cospi_p04_p28, &step1[4], &step1[7]); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + multiplication_and_add_2(&zero, &io[0], &k__cospi_p16_p16, + &k__cospi_m16_p16, &step1[0], &step1[1]); + multiplication_and_add(&step2[8], &step2[15], &step2[11], &step2[12], + &k__cospi_m08_p24, 
&k__cospi_p24_p08, + &k__cospi_m24_m08, &k__cospi_m08_p24, &step2[9], + &step2[14], &step2[10], &step2[13]); + } + + // stage 5 + multiplication_and_add_2(&step1[4], &step1[7], &k__cospi_m16_p16, + &k__cospi_p16_p16, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[1], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[1], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + multiplication_and_add(&step1[10], &step1[13], &step1[11], &step1[12], + &k__cospi_m16_p16, &k__cospi_p16_p16, + &k__cospi_m16_p16, &k__cospi_p16_p16, &step2[10], + &step2[13], &step2[11], &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; + int i; + + // First 1-D inverse DCT + // Load input data. 
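+  // Only the first four rows are loaded: with at most 10 nonzero
+  // coefficients, the input is confined to the top-left 4x4 corner of the
+  // 16x16 block.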
+ in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + int j; + idct16x16_10_pass2(l + 8 * i, in); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, in[j]); } dest += 8; @@ -1216,182 +1310,6 @@ void iadst16_sse2(__m128i *in0, __m128i *in1) { iadst16_8col(in1); } -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3; - int i; - // First 1-D inverse DCT - // Load input data. 
- in[0] = load_input_data4(input + 0 * 16); - in[1] = load_input_data4(input + 1 * 16); - in[2] = load_input_data4(input + 2 * 16); - in[3] = load_input_data4(input + 3 * 16); - - transpose_16bit_4x4(in, in); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - stp2_8 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_1_15); - stp2_11 = idct_calc_wraplow_sse2(stg2_6, stg2_7, lo_13_3); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - stp1_4 = idct_calc_wraplow_sse2(stg3_0, stg3_1, lo_2_14); - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = idct_madd_round_shift_sse2(lo_0_8, stg4_0); - tmp1 = idct_madd_round_shift_sse2(lo_0_8, stg4_1); - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp1, tmp1); - stp2_9 = idct_calc_wraplow_sse2(stg4_4, stg4_5, lo_9_14); - stp2_10 = idct_calc_wraplow_sse2(stg4_6, stg4_7, lo_10_13); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - stp1_6 = idct_calc_wraplow_sse2(stg4_0, stg4_1, lo_6_5); - tmp0 = idct_madd_round_shift_sse2(lo_10_13, stg6_0); - tmp1 = idct_madd_round_shift_sse2(lo_10_13, stg4_0); - tmp2 = idct_madd_round_shift_sse2(lo_11_12, stg6_0); - tmp3 = idct_madd_round_shift_sse2(lo_11_12, stg4_0); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp1, zero); - stp2_11 = _mm_packs_epi32(tmp2, zero); - stp2_12 = _mm_packs_epi32(tmp3, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. 
- l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - transpose_16bit_4x8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - recon_and_store(dest + j * stride, in[j]); - } - - dest += 8; - } -} - #define IDCT32_34 \ /* Stage1 */ \ multiplication_and_add_2(&in[1], &zero, &stg1_0, &stg1_1, &stp1_16, \ @@ -1594,7 +1512,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); // idct constants for each stage const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); @@ -1726,10 +1643,7 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, in[31] = _mm_sub_epi16(stp1_0, stp1_31); for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - recon_and_store(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, in[j]); } dest += 8; diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 1499b59e5..da34f868e 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,6 +12,7 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" @@ -88,6 +89,17 @@ static INLINE void multiplication_and_add_2(const __m128i *const in0, *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1); } +// Multiply elements by constants and add them together. 
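+// Calls multiplication_and_add_2() on the (in0, in1) pair with constants
+// (cst0, cst1) and on the (in2, in3) pair with (cst2, cst3), producing four
+// rotated outputs per call.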
+static INLINE void multiplication_and_add( + const __m128i *const in0, const __m128i *const in1, + const __m128i *const in2, const __m128i *const in3, + const __m128i *const cst0, const __m128i *const cst1, + const __m128i *const cst2, const __m128i *const cst3, __m128i *const res0, + __m128i *const res1, __m128i *const res2, __m128i *const res3) { + multiplication_and_add_2(in0, in1, cst0, cst1, res0, res1); + multiplication_and_add_2(in2, in3, cst2, cst3, res2, res3); +} + // Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled static INLINE __m128i load_input_data4(const tran_low_t *data) { @@ -347,6 +359,114 @@ static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, *x1 = _mm_packs_epi32(tmp2, tmp3); } +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8]; + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28, + &cp_n20_12, &cp_12_20, &step1[4], &step1[7], + &step1[5], &step1[6]); + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16, + &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0], + &step2[1], &step2[2], &step2[3]); + } + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16, + &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = 
idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1);  // step1 4&7
+    step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3);  // step1 5&6
+  }
+
+  // stage 2
+  {
+    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero);
+    const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero);
+    const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0);
+    step2[0] = _mm_packs_epi32(t, t);  // step2 0&1
+    step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2);  // step2 3&2
+    step2[4] = _mm_add_epi16(step1[4], step1[5]);  // step2 4&7
+    step2[5] = _mm_sub_epi16(step1[4], step1[5]);  // step2 5&6
+    step2[6] = _mm_unpackhi_epi64(step2[5], zero);  // step2 6
+  }
+
+  // stage 3
+  {
+    const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+    tmp[0] = _mm_add_epi16(step2[0], step2[2]);  // step1 0&1
+    tmp[1] = _mm_sub_epi16(step2[0], step2[2]);  // step1 3&2
+    step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);  // step1 2&1
+    step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);  // step1 3&0
+    step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65);  // step1 5&6
+  }
+
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
+
+  idct8x8_12_transpose_16bit_4x8(tmp, io);
+  io[4] = io[5] = io[6] = io[7] = zero;
+
+  idct8(io, io);
+}
+
 void idct4_sse2(__m128i *in);
 void idct8_sse2(__m128i *in);
 void idct16_sse2(__m128i *in0, __m128i *in1);
diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h
index f8edb1b78..0a9542c85 100644
--- a/vpx_dsp/x86/txfm_common_sse2.h
+++ b/vpx_dsp/x86/txfm_common_sse2.h
@@ -18,6 +18,9 @@
   _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                 (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
 
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
 #define dual_set_epi16(a, b) \
   _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
                 (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
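Note: every pair_set_epi16 constant above feeds a _mm_madd_epi16 butterfly:
_mm_unpacklo_epi16 interleaves two coefficient vectors, and
idct_calc_wraplow_sse2 multiplies each interleaved pair by a pair of cosine
constants, rounds, and shifts by DCT_CONST_BITS. A scalar sketch of one
output lane follows; butterfly_lane is an illustrative name, not a function
in the tree, and the SSE2 path additionally saturates when packing back to
16 bits:

    #include <stdint.h>

    #define DCT_CONST_BITS 14 /* as in vpx_dsp/txfm_common.h */

    /* One lane of multiplication_and_add_2(): rotate (x, y) by the
     * constant pair (c0, c1) with round-to-nearest. */
    static int16_t butterfly_lane(int16_t x, int16_t y, int c0, int c1) {
      const int rounding = 1 << (DCT_CONST_BITS - 1);
      return (int16_t)((x * c0 + y * c1 + rounding) >> DCT_CONST_BITS);
    }

For example, the stage-4 pair k__cospi_m08_p24/k__cospi_p24_p08 computes
butterfly_lane(x, y, -cospi_8_64, cospi_24_64) and
butterfly_lane(x, y, cospi_24_64, cospi_8_64) in each of the eight lanes.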
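Note: the new write_buffer_8x1 helper centralizes the final rounding that
several call sites previously open-coded. The transforms carry 6 fractional
bits at this point, so the residual is rounded with 1 << 5 and shifted right
by 6 before recon_and_store adds it to the prediction with clamping. A
scalar model of one pixel; reconstruct_pixel is illustrative only:

    #include <stdint.h>

    /* Final rounding and shift from write_buffer_8x1, followed by the
     * clamped add that recon_and_store performs against the prediction. */
    static uint8_t reconstruct_pixel(int16_t residual, uint8_t pred) {
      const int rounded = (residual + (1 << 5)) >> 6;
      const int pixel = pred + rounded;
      return (uint8_t)(pixel < 0 ? 0 : pixel > 255 ? 255 : pixel);
    }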
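Note: pair_set_epi32, added to txfm_common_sse2.h above, is the 32-bit
analogue of pair_set_epi16: pair_set_epi32(a, b) builds the lanes
{a, b, a, b} from low to high. A plausible use, mirroring how the 16-bit
pairs are formed (illustrative, not a line from this change):

    const __m128i k = pair_set_epi32(cospi_16_64, -cospi_16_64);

which would suit the 32-bit multiplies in the high-bitdepth SSE4.1 paths
this change introduces.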