From 9c43d81bc24896432d7c6a46b8757898dbae91a5 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Fri, 30 Jun 2017 13:55:38 -0700 Subject: Refactor highbd idct 4x4 and 8x8 x86 functions BUG=webm:1412 Change-Id: I221dff34dd5f71b390b5e043d0a137ccb0a01dec --- test/partial_idct_test.cc | 18 +++++ vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 73 +++---------------- vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 26 +++---- vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 105 +++++++-------------------- vpx_dsp/x86/highbd_inv_txfm_sse2.h | 94 +++++++++++++++++++++++- vpx_dsp/x86/highbd_inv_txfm_sse4.h | 47 ++++++------ vpx_dsp/x86/inv_txfm_sse2.c | 133 +++------------------------------- vpx_dsp/x86/inv_txfm_sse2.h | 126 ++++++++++++++++++++++++++++++++ vpx_dsp/x86/txfm_common_sse2.h | 3 + 9 files changed, 317 insertions(+), 308 deletions(-) diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 50a83342f..9267ebb05 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -754,6 +754,24 @@ INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 12, 2), make_tuple( &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2), diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c index 505342b1c..9b953dd36 100644 --- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c +++
b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" @@ -72,73 +74,20 @@ static INLINE void highbd_idct4_small_sse2(__m128i *const io) { io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] } -static INLINE void abs_extend_64bit_sse2(const __m128i in, - __m128i *const out /*out[2]*/, - __m128i *const sign /*sign[2]*/) { - sign[0] = _mm_srai_epi32(in, 31); - out[0] = _mm_xor_si128(in, sign[0]); - out[0] = _mm_sub_epi32(out[0], sign[0]); - sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 - sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 - out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 - out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 -} - -static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, - const __m128i sign, - const __m128i cospi) { - __m128i out = _mm_mul_epu32(in, cospi); - out = _mm_xor_si128(out, sign); - return _mm_sub_epi64(out, sign); -} - static INLINE void highbd_idct4_large_sse2(__m128i *const io) { - const __m128i cospi_p16_p16 = - _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0); - const __m128i cospi_p08_p08 = - _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0); - const __m128i cospi_p24_p24 = - _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0); - __m128i temp1[4], temp2[4], step[4], sign1[4], sign2[4]; + __m128i temp[2], sign[2], step[4]; transpose_32bit_4x4(io, io); // stage 1 - temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] - temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] - abs_extend_64bit_sse2(temp1[0], temp1, sign1); - abs_extend_64bit_sse2(temp2[0], temp2, sign2); - temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p16_p16); - temp1[1] = multiply_apply_sign_sse2(temp1[1],
sign1[1], cospi_p16_p16); - temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p16_p16); - temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p16_p16); - temp1[0] = dct_const_round_shift_64bit(temp1[0]); - temp1[1] = dct_const_round_shift_64bit(temp1[1]); - temp2[0] = dct_const_round_shift_64bit(temp2[0]); - temp2[1] = dct_const_round_shift_64bit(temp2[1]); - step[0] = pack_4(temp1[0], temp1[1]); - step[1] = pack_4(temp2[0], temp2[1]); - - abs_extend_64bit_sse2(io[1], temp1, sign1); - abs_extend_64bit_sse2(io[3], temp2, sign2); - temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p08_p08); - temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p08_p08); - temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p24_p24); - temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p24_p24); - temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p24_p24); - temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p24_p24); - temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p08_p08); - temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p08_p08); - temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); // [1]*cospi_24 - [3]*cospi_8 - temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); // [1]*cospi_24 - [3]*cospi_8 - temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 - temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 - temp1[0] = dct_const_round_shift_64bit(temp1[0]); - temp1[1] = dct_const_round_shift_64bit(temp1[1]); - temp2[0] = dct_const_round_shift_64bit(temp2[0]); - temp2[1] = dct_const_round_shift_64bit(temp2[1]); - step[2] = pack_4(temp1[0], temp1[1]); - step[3] = pack_4(temp2[0], temp2[1]); + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + abs_extend_64bit_sse2(temp[0], temp, sign); + step[0] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + 
abs_extend_64bit_sse2(temp[0], temp, sign); + step[1] = multiplication_round_shift_sse2(temp, sign, (int)cospi_16_64); + highbd_multiplication_and_add_sse2(io[1], io[3], (int)cospi_24_64, + (int)cospi_8_64, &step[2], &step[3]); // stage 2 io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c index cc20df03c..d61cae952 100644 --- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <smmintrin.h> +#include <smmintrin.h> // SSE4.1 #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" @@ -17,25 +17,19 @@ #include "vpx_dsp/x86/transpose_sse2.h" static INLINE void highbd_idct4(__m128i *const io) { - const __m128i cospi_p16_p16 = - _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0); - const __m128i cospi_p08_p08 = - _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0); - const __m128i cospi_p24_p24 = - _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0); - __m128i temp1[4], step[4]; + __m128i temp[2], step[4]; transpose_32bit_4x4(io, io); // stage 1 - temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] - extend_64bit(temp1[0], temp1); - step[0] = multiplication_round_shift(temp1, cospi_p16_p16); - temp1[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] - extend_64bit(temp1[0], temp1); - step[1] = multiplication_round_shift(temp1, cospi_p16_p16); - multiplication_and_add_2_ssse4_1(&io[1], &io[3], &cospi_p24_p24, - &cospi_p08_p08, &step[2], &step[3]); + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64); +
highbd_multiplication_and_add_sse4_1(io[1], io[3], (int)cospi_24_64, + (int)cospi_8_64, &step[2], &step[3]); // stage 2 io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index 4513c0fb5..80eedec5e 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <smmintrin.h> +#include <smmintrin.h> // SSE4.1 #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" @@ -18,20 +18,6 @@ #include "vpx_dsp/x86/transpose_sse2.h" static void highbd_idct8x8_half1d(__m128i *const io) { - const __m128i cp_4q_4q = - _mm_setr_epi32((int)cospi_4_64 << 2, 0, (int)cospi_4_64 << 2, 0); - const __m128i cp_8q_8q = - _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0); - const __m128i cp_12q_12q = - _mm_setr_epi32((int)cospi_12_64 << 2, 0, (int)cospi_12_64 << 2, 0); - const __m128i cp_16q_16q = - _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0); - const __m128i cp_20q_20q = - _mm_setr_epi32((int)cospi_20_64 << 2, 0, (int)cospi_20_64 << 2, 0); - const __m128i cp_24q_24q = - _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0); - const __m128i cp_28q_28q = - _mm_setr_epi32((int)cospi_28_64 << 2, 0, (int)cospi_28_64 << 2, 0); __m128i temp1[4], temp2[4], step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -41,20 +27,20 @@ static void highbd_idct8x8_half1d(__m128i *const io) { step1[2] = io[4]; step1[1] = io[2]; step1[3] = io[6]; - multiplication_and_add_2_ssse4_1(&io[1], &io[7], &cp_28q_28q, &cp_4q_4q, - &step1[4], &step1[7]); - multiplication_and_add_2_ssse4_1(&io[5], &io[3], &cp_12q_12q, &cp_20q_20q, - &step1[5], &step1[6]); + highbd_multiplication_and_add_sse4_1(io[1], io[7], (int)cospi_28_64, + (int)cospi_4_64, &step1[4], &step1[7]); + highbd_multiplication_and_add_sse4_1(io[5], io[3], (int)cospi_12_64, + (int)cospi_20_64,
&step1[5], &step1[6]); // stage 2 temp2[0] = _mm_add_epi32(step1[0], step1[2]); extend_64bit(temp2[0], temp1); - step2[0] = multiplication_round_shift(temp1, cp_16q_16q); + step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); temp2[0] = _mm_sub_epi32(step1[0], step1[2]); extend_64bit(temp2[0], temp1); - step2[1] = multiplication_round_shift(temp1, cp_16q_16q); - multiplication_and_add_2_ssse4_1(&step1[1], &step1[3], &cp_24q_24q, &cp_8q_8q, - &step2[2], &step2[3]); + step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); + highbd_multiplication_and_add_sse4_1(step1[1], step1[3], (int)cospi_24_64, + (int)cospi_8_64, &step2[2], &step2[3]); step2[4] = _mm_add_epi32(step1[4], step1[5]); step2[5] = _mm_sub_epi32(step1[4], step1[5]); step2[6] = _mm_sub_epi32(step1[7], step1[6]); @@ -68,38 +54,17 @@ static void highbd_idct8x8_half1d(__m128i *const io) { step1[4] = step2[4]; temp2[0] = _mm_sub_epi32(step2[6], step2[5]); extend_64bit(temp2[0], temp1); - step1[5] = multiplication_round_shift(temp1, cp_16q_16q); + step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); temp2[0] = _mm_add_epi32(step2[6], step2[5]); extend_64bit(temp2[0], temp1); - step1[6] = multiplication_round_shift(temp1, cp_16q_16q); + step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); step1[7] = step2[7]; // stage 4 - io[0] = _mm_add_epi32(step1[0], step1[7]); - io[1] = _mm_add_epi32(step1[1], step1[6]); - io[2] = _mm_add_epi32(step1[2], step1[5]); - io[3] = _mm_add_epi32(step1[3], step1[4]); - io[4] = _mm_sub_epi32(step1[3], step1[4]); - io[5] = _mm_sub_epi32(step1[2], step1[5]); - io[6] = _mm_sub_epi32(step1[1], step1[6]); - io[7] = _mm_sub_epi32(step1[0], step1[7]); + highbd_idct8_stage4(step1, io); } static void highbd_idct8x8_12_half1d(__m128i *const io) { - const __m128i cp_28q_28q = - _mm_setr_epi32((int)cospi_28_64 << 2, 0, (int)cospi_28_64 << 2, 0); - const __m128i cp_4q_4q = - _mm_setr_epi32((int)cospi_4_64 << 2, 0, 
(int)cospi_4_64 << 2, 0); - const __m128i cp_n20q_n20q = - _mm_setr_epi32(-(int)cospi_20_64 * 4, 0, -(int)cospi_20_64 * 4, 0); - const __m128i cp_12q_12q = - _mm_setr_epi32((int)cospi_12_64 << 2, 0, (int)cospi_12_64 << 2, 0); - const __m128i cp_16q_16q = - _mm_setr_epi32((int)cospi_16_64 << 2, 0, (int)cospi_16_64 << 2, 0); - const __m128i cp_8q_8q = - _mm_setr_epi32((int)cospi_8_64 << 2, 0, (int)cospi_8_64 << 2, 0); - const __m128i cp_24q_24q = - _mm_setr_epi32((int)cospi_24_64 << 2, 0, (int)cospi_24_64 << 2, 0); __m128i temp1[4], temp2[4], step1[8], step2[8]; transpose_32bit_4x4(io, io); @@ -108,18 +73,18 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) { step1[0] = io[0]; step1[1] = io[2]; extend_64bit(io[1], temp1); - step1[4] = multiplication_round_shift(temp1, cp_28q_28q); - step1[7] = multiplication_round_shift(temp1, cp_4q_4q); + step1[4] = multiplication_round_shift_sse4_1(temp1, (int)cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, (int)cospi_4_64); extend_64bit(io[3], temp1); - step1[5] = multiplication_round_shift(temp1, cp_n20q_n20q); - step1[6] = multiplication_round_shift(temp1, cp_12q_12q); + step1[5] = multiplication_round_shift_sse4_1(temp1, -(int)cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_12_64); // stage 2 extend_64bit(step1[0], temp1); - step2[0] = multiplication_round_shift(temp1, cp_16q_16q); + step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); extend_64bit(step1[1], temp1); - step2[2] = multiplication_round_shift(temp1, cp_24q_24q); - step2[3] = multiplication_round_shift(temp1, cp_8q_8q); + step2[2] = multiplication_round_shift_sse4_1(temp1, (int)cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, (int)cospi_8_64); step2[4] = _mm_add_epi32(step1[4], step1[5]); step2[5] = _mm_sub_epi32(step1[4], step1[5]); step2[6] = _mm_sub_epi32(step1[7], step1[6]); @@ -133,21 +98,14 @@ static void highbd_idct8x8_12_half1d(__m128i *const io) { step1[4] = 
step2[4]; temp2[0] = _mm_sub_epi32(step2[6], step2[5]); extend_64bit(temp2[0], temp1); - step1[5] = multiplication_round_shift(temp1, cp_16q_16q); + step1[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); temp2[0] = _mm_add_epi32(step2[6], step2[5]); extend_64bit(temp2[0], temp1); - step1[6] = multiplication_round_shift(temp1, cp_16q_16q); + step1[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64); step1[7] = step2[7]; // stage 4 - io[0] = _mm_add_epi32(step1[0], step1[7]); - io[1] = _mm_add_epi32(step1[1], step1[6]); - io[2] = _mm_add_epi32(step1[2], step1[5]); - io[3] = _mm_add_epi32(step1[3], step1[4]); - io[4] = _mm_sub_epi32(step1[3], step1[4]); - io[5] = _mm_sub_epi32(step1[2], step1[5]); - io[6] = _mm_sub_epi32(step1[1], step1[6]); - io[7] = _mm_sub_epi32(step1[0], step1[7]); + highbd_idct8_stage4(step1, io); } void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, @@ -210,20 +168,14 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[6] = io[10]; io[7] = io[11]; highbd_idct8x8_half1d(io); + io[8] = temp[0]; io[9] = temp[1]; io[10] = temp[2]; io[11] = temp[3]; highbd_idct8x8_half1d(&io[8]); - io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); - io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); - io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); - io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); - io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); - io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); - io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); - io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); + highbd_idct8x8_final_round(io); } recon_and_store_8(io, dest, stride, bd); @@ -266,14 +218,7 @@ void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[11] = temp[3]; highbd_idct8x8_12_half1d(&io[8]); - io[0] = 
wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); - io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); - io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); - io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); - io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); - io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); - io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); - io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); + highbd_idct8x8_final_round(io); } recon_and_store_8(io, dest, stride, bd); diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 2d1f39376..7a303e2ca 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -12,6 +12,7 @@ #define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" @@ -44,9 +45,8 @@ static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, } static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { - const __m128i t = _mm_add_epi64( - in, - _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0)); + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); return _mm_srli_si128(t, 2); } @@ -56,6 +56,94 @@ static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 } +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0],
out[0]); // 0, 1 +} + +// Note: cospi must be non negative. +static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_multiplication_and_add_sse2( + const __m128i in0, const __m128i in1, const int c0, const int c1, + __m128i *const out0, __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[4], sign2[4]; + + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = 
dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, const int bd) { const __m128i zero = _mm_set1_epi16(0); diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 72d3d5327..170f641d3 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -14,37 +14,38 @@ #include <smmintrin.h> // SSE4.1 #include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vpx_dsp/inv_txfm.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" -static INLINE __m128i multiplication_round_shift(const __m128i *const in, - const __m128i cospi) { +static INLINE __m128i multiplication_round_shift_sse4_1( +
const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); __m128i t0, t1; - t0 = _mm_mul_epi32(in[0], cospi); - t1 = _mm_mul_epi32(in[1], cospi); + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); t0 = dct_const_round_shift_64bit(t0); t1 = dct_const_round_shift_64bit(t1); + return pack_4(t0, t1); } -static INLINE void multiplication_and_add_2_ssse4_1(const __m128i *const in0, - const __m128i *const in1, - const __m128i *const cst0, - const __m128i *const cst1, - __m128i *const out0, - __m128i *const out1) { +static INLINE void highbd_multiplication_and_add_sse4_1( + const __m128i in0, const __m128i in1, const int c0, const int c1, + __m128i *const out0, __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); __m128i temp1[4], temp2[4]; - extend_64bit(*in0, temp1); - extend_64bit(*in1, temp2); - temp1[2] = _mm_mul_epi32(temp1[0], *cst1); - temp1[3] = _mm_mul_epi32(temp1[1], *cst1); - temp1[0] = _mm_mul_epi32(temp1[0], *cst0); - temp1[1] = _mm_mul_epi32(temp1[1], *cst0); - temp2[2] = _mm_mul_epi32(temp2[0], *cst0); - temp2[3] = _mm_mul_epi32(temp2[1], *cst0); - temp2[0] = _mm_mul_epi32(temp2[0], *cst1); - temp2[1] = _mm_mul_epi32(temp2[1], *cst1); + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index a188f8337..8a501b111 100644 --- 
a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" @@ -146,72 +148,6 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -// Multiply elements by constants and add them together. -static INLINE void multiplication_and_add( - const __m128i *const in0, const __m128i *const in1, - const __m128i *const in2, const __m128i *const in3, - const __m128i *const cst0, const __m128i *const cst1, - const __m128i *const cst2, const __m128i *const cst3, __m128i *const res0, - __m128i *const res1, __m128i *const res2, __m128i *const res3) { - const __m128i lo_0 = _mm_unpacklo_epi16(*in0, *in1); - const __m128i hi_0 = _mm_unpackhi_epi16(*in0, *in1); - const __m128i lo_1 = _mm_unpacklo_epi16(*in2, *in3); - const __m128i hi_1 = _mm_unpackhi_epi16(*in2, *in3); - *res0 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst0); - *res1 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst1); - *res2 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst2); - *res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3); -} - -static INLINE void idct8(const __m128i *const in, __m128i *const out) { - const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - __m128i step1[8], step2[8]; - - // stage 1 - { - const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); - multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28, - &cp_n20_12, &cp_12_20, &step1[4], &step1[7], - &step1[5], &step1[6]); - } - - // stage 2 - { - const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const
__m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); - multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16, - &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0], - &step2[1], &step2[2], &step2[3]); - } - - step2[4] = _mm_add_epi16(step1[4], step1[5]); - step2[5] = _mm_sub_epi16(step1[4], step1[5]); - step2[6] = _mm_sub_epi16(step1[7], step1[6]); - step2[7] = _mm_add_epi16(step1[7], step1[6]); - - // stage 3 - step1[0] = _mm_add_epi16(step2[0], step2[3]); - step1[1] = _mm_add_epi16(step2[1], step2[2]); - step1[2] = _mm_sub_epi16(step2[1], step2[2]); - step1[3] = _mm_sub_epi16(step2[0], step2[3]); - multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16, - &step1[5], &step1[6]); - - // stage 4 - out[0] = _mm_add_epi16(step1[0], step2[7]); - out[1] = _mm_add_epi16(step1[1], step1[6]); - out[2] = _mm_add_epi16(step1[2], step1[5]); - out[3] = _mm_add_epi16(step1[3], step2[4]); - out[4] = _mm_sub_epi16(step1[3], step2[4]); - out[5] = _mm_sub_epi16(step1[2], step1[5]); - out[6] = _mm_sub_epi16(step1[1], step1[6]); - out[7] = _mm_sub_epi16(step1[0], step2[7]); -} - void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i in[8]; @@ -475,66 +411,15 @@ void iadst8_sse2(__m128i *in) { void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - __m128i in[8], step1[8], step2[8], tmp[4]; + __m128i io[8]; - in[0] = load_input_data4(input + 0 * 8); - in[1] = load_input_data4(input + 1 * 8); - in[2] = load_input_data4(input + 2 * 8); - in[3] = load_input_data4(input + 3 * 8); + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); - transpose_16bit_4x4(in, in); - // in[0]: 00 10 20 30 01 11 21 31 - // 
in[1]: 02 12 22 32 03 13 23 33 - - // stage 1 - { - const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero); - step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 - step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 - } - - // stage 2 - { - const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero); - step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0); // step2 0&1 - step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 - step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 - step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 - step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 - } - - // stage 3 - { - const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); - tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 - tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 - step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 - step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 - step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 - } - - // stage 4 - tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 - tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 - tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 - tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 - - idct8x8_12_transpose_16bit_4x8(tmp, in); - in[4] = in[5] = in[6] = in[7] = zero; - - idct8(in, in); - write_buffer_8x8(in, dest, 
stride); + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); } #define IDCT16_10 \ diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 1499b59e5..86f1932a5 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,6 +12,7 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" @@ -347,6 +348,131 @@ static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, *x1 = _mm_packs_epi32(tmp2, tmp3); } +// Multiply elements by constants and add them together. +static INLINE void multiplication_and_add( + const __m128i *const in0, const __m128i *const in1, + const __m128i *const in2, const __m128i *const in3, + const __m128i *const cst0, const __m128i *const cst1, + const __m128i *const cst2, const __m128i *const cst3, __m128i *const res0, + __m128i *const res1, __m128i *const res2, __m128i *const res3) { + const __m128i lo_0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i hi_0 = _mm_unpackhi_epi16(*in0, *in1); + const __m128i lo_1 = _mm_unpacklo_epi16(*in2, *in3); + const __m128i hi_1 = _mm_unpackhi_epi16(*in2, *in3); + *res0 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst0); + *res1 = idct_calc_wraplow_sse2(lo_0, hi_0, *cst1); + *res2 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst2); + *res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3); +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8]; + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); +
multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28, + &cp_n20_12, &cp_12_20, &step1[4], &step1[7], + &step1[5], &step1[6]); + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16, + &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0], + &step2[1], &step2[2], &step2[3]); + } + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16, + &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i 
cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } + + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); +} + void idct4_sse2(__m128i *in); void idct8_sse2(__m128i *in); void idct16_sse2(__m128i *in0, __m128i *in1); diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h index f8edb1b78..0a9542c85 100644 --- a/vpx_dsp/x86/txfm_common_sse2.h +++ 
b/vpx_dsp/x86/txfm_common_sse2.h @@ -18,6 +18,9 @@ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + #define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) -- cgit v1.2.3