diff options
-rw-r--r-- | vpx_dsp/x86/inv_txfm_sse2.c | 179 |
1 files changed, 87 insertions, 92 deletions
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 99f5570cb..2d0318d99 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -175,46 +175,52 @@ static void multiplication_and_add_2(const __m128i *const in0, } static INLINE void idct8(const __m128i *const in, __m128i *const out) { - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - /* Stage1 */ - multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &stg1_0, &stg1_1, - &stg1_2, &stg1_3, &stp1_4, &stp1_7, &stp1_5, &stp1_6); - - /* Stage2 */ - multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &stg2_0, &stg2_1, - &stg2_2, &stg2_3, &stp2_0, &stp2_1, &stp2_2, &stp2_3); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - /* Stage3 */ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - multiplication_and_add_2(&stp2_6, &stp2_5, &stg2_1, &stg2_0, &stp1_5, - &stp1_6); - - /* Stage4 */ - out[0] = _mm_add_epi16(stp1_0, stp2_7); - out[1] = _mm_add_epi16(stp1_1, stp1_6); - out[2] = _mm_add_epi16(stp1_2, stp1_5); - out[3] = _mm_add_epi16(stp1_3, stp2_4); - out[4] = _mm_sub_epi16(stp1_3, stp2_4); - out[5] = _mm_sub_epi16(stp1_2, stp1_5); - out[6] = _mm_sub_epi16(stp1_1, stp1_6); - out[7] = _mm_sub_epi16(stp1_0, stp2_7); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8]; + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28, + &cp_n20_12, &cp_12_20, &step1[4], &step1[7], + &step1[5], &step1[6]); + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16, + &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0], + &step2[1], &step2[2], &step2[3]); + } + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16, + &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); } void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -481,70 +487,59 @@ void iadst8_sse2(__m128i *in) { void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i in[8], step1[8], step2[8], tmp[4]; - __m128i in[8]; - __m128i stp1_2, stp1_3, stp1_4, stp1_5; - __m128i stp2_0, stp2_2, stp2_4, stp2_5, stp2_6; - __m128i tmp[4]; + in[0] = load_input_data(input + 0 * 8); + in[1] = load_input_data(input + 1 * 8); + in[2] = load_input_data(input + 2 * 8); + in[3] = load_input_data(input + 3 * 8); - // Rows. Load 4-row input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - - // 8x4 Transpose transpose_16bit_4x4(in, in); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in[1], zero); + // in[0]: 00 10 20 30 01 11 21 31 + // in[1]: 02 12 22 32 03 13 23 33 - stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17); - stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35); + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 } - // Stage2 + // stage 2 { - const __m128i lo_04 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in[1], zero); - - stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04); - stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26); - - tmp[0] = _mm_add_epi16(stp1_4, stp1_5); - tmp[1] = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp[0]; - stp2_5 = _mm_unpacklo_epi64(tmp[1], zero); - stp2_6 = _mm_unpackhi_epi64(tmp[1], zero); + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero); + step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 } - // Stage3 + // stage 3 { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp[0] = _mm_add_epi16(stp2_0, stp2_2); - tmp[1] = _mm_sub_epi16(stp2_0, stp2_2); - stp1_2 = _mm_unpackhi_epi64(tmp[1], tmp[0]); - stp1_3 = _mm_unpacklo_epi64(tmp[1], tmp[0]); - stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56); // stg3_1 = stg2_0 + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 } - // Stage4 - tmp[0] = _mm_add_epi16(stp1_3, stp2_4); - tmp[1] = _mm_add_epi16(stp1_2, stp1_5); - tmp[2] = _mm_sub_epi16(stp1_3, stp2_4); - tmp[3] = _mm_sub_epi16(stp1_2, stp1_5); + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 idct8x8_12_transpose_16bit_4x8(tmp, in); in[4] = in[5] = in[6] = in[7] = zero; |