From e20ca4fead6e48c2af1a5cff05b97c4b4cf2526c Mon Sep 17 00:00:00 2001
From: Linfeng Zhang
Date: Fri, 5 Jan 2018 09:57:56 -0800
Subject: Add vp9_highbd_iht4x4_16_add_sse4_1()

BUG=webm:1413

Change-Id: I14930d0af24370a44ab359de5bba5512eef4e29f
---
 vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 26 ++------------------------
 vpx_dsp/x86/highbd_inv_txfm_sse2.h    |  4 ++++
 vpx_dsp/x86/highbd_inv_txfm_sse4.h    | 22 ++++++++++++++++++++++
 3 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'vpx_dsp')

diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
index 38e64f3bc..fe74d272a 100644
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -16,28 +16,6 @@
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
 
-static INLINE void highbd_idct4(__m128i *const io) {
-  __m128i temp[2], step[4];
-
-  transpose_32bit_4x4(io, io);
-
-  // stage 1
-  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
-  extend_64bit(temp[0], temp);
-  step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
-  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
-  extend_64bit(temp[0], temp);
-  step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
-  highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
-                          &step[3]);
-
-  // stage 2
-  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
-  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
-  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
-  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
-}
-
 void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
   __m128i io[4];
@@ -59,8 +37,8 @@ void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
     io[0] = _mm_srai_epi16(io_short[0], 4);
     io[1] = _mm_srai_epi16(io_short[1], 4);
   } else {
-    highbd_idct4(io);
-    highbd_idct4(io);
+    highbd_idct4_sse4_1(io);
+    highbd_idct4_sse4_1(io);
     io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
     io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
   }
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index e0f749552..c89666b1e 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -19,6 +19,10 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
+// Note: There is no 64-bit bit-level shifting SIMD instruction. All
+// coefficients are left shifted by 2, so that dct_const_round_shift() can be
+// done by right shifting 2 bytes.
+
 static INLINE void extend_64bit(const __m128i in,
                                 __m128i *const out /*out[2]*/) {
   out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index 9c8eef40f..435934f1b 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -84,4 +84,26 @@ static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
   *out1 = multiplication_round_shift_sse4_1(temp, c1);
 }
 
+static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
+  __m128i temp[2], step[4];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  temp[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
+  extend_64bit(temp[0], temp);
+  step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  temp[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  extend_64bit(temp[0], temp);
+  step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
+  highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2],
+                          &step[3]);
+
+  // stage 2
+  io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
+  io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
+  io[2] = _mm_sub_epi32(step[1], step[2]);  // step[1] - step[2]
+  io[3] = _mm_sub_epi32(step[0], step[3]);  // step[0] - step[3]
+}
+
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
-- 
cgit v1.2.3