diff options
Diffstat (limited to 'vpx_dsp/x86/inv_txfm_sse2.h')
-rw-r--r-- | vpx_dsp/x86/inv_txfm_sse2.h | 120 |
1 files changed, 120 insertions, 0 deletions
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 1499b59e5..da34f868e 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,6 +12,7 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" @@ -88,6 +89,17 @@ static INLINE void multiplication_and_add_2(const __m128i *const in0, *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1); } +// Multiply elements by constants and add them together. +static INLINE void multiplication_and_add( + const __m128i *const in0, const __m128i *const in1, + const __m128i *const in2, const __m128i *const in3, + const __m128i *const cst0, const __m128i *const cst1, + const __m128i *const cst2, const __m128i *const cst3, __m128i *const res0, + __m128i *const res1, __m128i *const res2, __m128i *const res3) { + multiplication_and_add_2(in0, in1, cst0, cst1, res0, res1); + multiplication_and_add_2(in2, in3, cst2, cst3, res2, res3); +} + // Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled static INLINE __m128i load_input_data4(const tran_low_t *data) { @@ -347,6 +359,114 @@ static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, *x1 = _mm_packs_epi32(tmp2, tmp3); } +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8]; + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28, + &cp_n20_12, &cp_12_20, &step1[4], &step1[7], + &step1[5], &step1[6]); + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16, + &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0], + &step2[1], &step2[2], &step2[3]); + } + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16, + &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } + + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); +} + void idct4_sse2(__m128i *in); void idct8_sse2(__m128i *in); void idct16_sse2(__m128i *in0, __m128i *in1); |