diff options
-rw-r--r-- | test/partial_idct_test.cc | 30 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 471 | ||||
-rw-r--r-- | vpx_dsp/x86/highbd_inv_txfm_sse4.h | 15 |
5 files changed, 518 insertions, 5 deletions
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 9c409e9a7..8764a78de 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -765,6 +765,36 @@ INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 8, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 10, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 12, 2),
   make_tuple(
       &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
       &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2),
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ae98eb23d..705b3a610 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -246,6 +246,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c
 endif # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 99ef262b1..1dc3b1155 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -656,9 +656,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
-    specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
-    specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
-    specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
+    specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct32x32_1024_add neon/;
     specialize qw/vpx_highbd_idct32x32_135_add neon/;
     specialize qw/vpx_highbd_idct32x32_34_add neon/;
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
new file mode 100644
index 000000000..f25d8e5ee
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -0,0 +1,471 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+                                             __m128i *const out) {
+  __m128i temp1[2], temp2;
+  // stage 5
+  out[0] = _mm_add_epi32(in[0], in[3]);
+  out[1] = _mm_add_epi32(in[1], in[2]);
+  out[2] = _mm_sub_epi32(in[1], in[2]);
+  out[3] = _mm_sub_epi32(in[0], in[3]);
+  temp2 = _mm_sub_epi32(in[6], in[5]);
+  extend_64bit(temp2, temp1);
+  out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_add_epi32(in[6], in[5]);
+  extend_64bit(temp2, temp1);
+  out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  out[8] = _mm_add_epi32(in[8], in[11]);
+  out[9] = _mm_add_epi32(in[9], in[10]);
+  out[10] = _mm_sub_epi32(in[9], in[10]);
+  out[11] = _mm_sub_epi32(in[8], in[11]);
+  out[12] = _mm_sub_epi32(in[15], in[12]);
+  out[13] = _mm_sub_epi32(in[14], in[13]);
+  out[14] = _mm_add_epi32(in[14], in[13]);
+  out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+                                             __m128i *const out) {
+  __m128i temp1[2], temp2;
+  out[0] = _mm_add_epi32(in[0], in[7]);
+  out[1] = _mm_add_epi32(in[1], in[6]);
+  out[2] = _mm_add_epi32(in[2], in[5]);
+  out[3] = _mm_add_epi32(in[3], in[4]);
+  out[4] = _mm_sub_epi32(in[3], in[4]);
+  out[5] = _mm_sub_epi32(in[2], in[5]);
+  out[6] = _mm_sub_epi32(in[1], in[6]);
+  out[7] = _mm_sub_epi32(in[0], in[7]);
+  out[8] = in[8];
+  out[9] = in[9];
+  temp2 = _mm_sub_epi32(in[13], in[10]);
+  extend_64bit(temp2, temp1);
+  out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_add_epi32(in[13], in[10]);
+  extend_64bit(temp2, temp1);
+  out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+
+  temp2 = _mm_sub_epi32(in[12], in[11]);
+  extend_64bit(temp2, temp1);
+  out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_add_epi32(in[12], in[11]);
+  extend_64bit(temp2, temp1);
+  out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  out[14] = in[14];
+  out[15] = in[15];
+}
+
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp1[4], temp2;
+
+  // stage 2
+  highbd_multiplication_and_add_sse4_1(io[1], io[15], (int)cospi_30_64,
+                                       (int)cospi_2_64, &step2[8], &step2[15]);
+  highbd_multiplication_and_add_sse4_1(io[9], io[7], (int)cospi_14_64,
+                                       (int)cospi_18_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(io[5], io[11], (int)cospi_22_64,
+                                       (int)cospi_10_64, &step2[10],
+                                       &step2[13]);
+  highbd_multiplication_and_add_sse4_1(
+      io[13], io[3], (int)cospi_6_64, (int)cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3
+  highbd_multiplication_and_add_sse4_1(io[2], io[14], (int)cospi_28_64,
+                                       (int)cospi_4_64, &step1[4], &step1[7]);
+  highbd_multiplication_and_add_sse4_1(io[10], io[6], (int)cospi_12_64,
+                                       (int)cospi_20_64, &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4
+  temp2 = _mm_add_epi32(io[0], io[8]);
+  extend_64bit(temp2, temp1);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_sub_epi32(io[0], io[8]);
+  extend_64bit(temp2, temp1);
+  step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_multiplication_and_add_sse4_1(io[4], io[12], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[2], &step2[3]);
+  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                                       (int)cospi_24_64, &step2[13],
+                                       &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp1[2];
+
+  // stage 2
+  highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                               &step2[8], &step2[15]);
+  highbd_multiplication_sse4_1(io[7], -(int)cospi_18_64, (int)cospi_14_64,
+                               &step2[9], &step2[14]);
+  highbd_multiplication_sse4_1(io[5], (int)cospi_22_64, (int)cospi_10_64,
+                               &step2[10], &step2[13]);
+  highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
+                               &step2[11], &step2[12]);
+
+  // stage 3
+  highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                               &step1[4], &step1[7]);
+  highbd_multiplication_sse4_1(io[6], -(int)cospi_20_64, (int)cospi_12_64,
+                               &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4
+  extend_64bit(io[0], temp1);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  step2[1] = step2[0];
+  highbd_multiplication_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64,
+                               &step2[2], &step2[3]);
+  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                                       (int)cospi_24_64, &step2[13],
+                                       &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp[2];
+
+  // stage 2
+  highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                               &step2[8], &step2[15]);
+  highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
+                               &step2[11], &step2[12]);
+
+  // stage 3
+  highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                               &step1[4], &step1[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4
+  extend_64bit(io[0], temp);
+  step2[0] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64);
+  step2[1] = step2[0];
+  step2[2] = _mm_setzero_si128();
+  step2[3] = _mm_setzero_si128();
+  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                                       (int)cospi_24_64, &step2[13],
+                                       &step2[10]);
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);
+}
+
+void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
+                                         uint16_t *dest, int stride, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {
+    __m128i l[16], r[16];
+
+    in = l;
+    for (i = 0; i < 2; i++) {
+      in[0] = load_pack_8_32bit(input + 0 * 16);
+      in[1] = load_pack_8_32bit(input + 1 * 16);
+      in[2] = load_pack_8_32bit(input + 2 * 16);
+      in[3] = load_pack_8_32bit(input + 3 * 16);
+      in[4] = load_pack_8_32bit(input + 4 * 16);
+      in[5] = load_pack_8_32bit(input + 5 * 16);
+      in[6] = load_pack_8_32bit(input + 6 * 16);
+      in[7] = load_pack_8_32bit(input + 7 * 16);
+      transpose_16bit_8x8(in, in);
+
+      in[8] = load_pack_8_32bit(input + 0 * 16 + 8);
+      in[9] = load_pack_8_32bit(input + 1 * 16 + 8);
+      in[10] = load_pack_8_32bit(input + 2 * 16 + 8);
+      in[11] = load_pack_8_32bit(input + 3 * 16 + 8);
+      in[12] = load_pack_8_32bit(input + 4 * 16 + 8);
+      in[13] = load_pack_8_32bit(input + 5 * 16 + 8);
+      in[14] = load_pack_8_32bit(input + 6 * 16 + 8);
+      in[15] = load_pack_8_32bit(input + 7 * 16 + 8);
+      transpose_16bit_8x8(in + 8, in + 8);
+      idct16_8col(in);
+      in = r;
+      input += 128;
+    }
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      idct16_8col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
+      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
+      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
+      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
+      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
+      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
+      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
+      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
+      transpose_32bit_8x4(in, in);
+
+      in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8));
+      in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12));
+      in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8));
+      in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12));
+      in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8));
+      in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12));
+      in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8));
+      in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12));
+      transpose_32bit_8x4(in + 8, in + 8);
+
+      highbd_idct16_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      out[0] = all[0][i + 0];
+      out[1] = all[1][i + 0];
+      out[2] = all[0][i + 1];
+      out[3] = all[1][i + 1];
+      out[4] = all[0][i + 2];
+      out[5] = all[1][i + 2];
+      out[6] = all[0][i + 3];
+      out[7] = all[1][i + 3];
+      transpose_32bit_8x4(out, out);
+
+      out[8] = all[2][i + 0];
+      out[9] = all[3][i + 0];
+      out[10] = all[2][i + 1];
+      out[11] = all[3][i + 1];
+      out[12] = all[2][i + 2];
+      out[13] = all[3][i + 2];
+      out[14] = all[2][i + 3];
+      out[15] = all[3][i + 3];
+      transpose_32bit_8x4(out + 8, out + 8);
+
+      highbd_idct16_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    __m128i in[16];
+
+    in[0] = load_pack_8_32bit(input + 0 * 16);
+    in[1] = load_pack_8_32bit(input + 1 * 16);
+    in[2] = load_pack_8_32bit(input + 2 * 16);
+    in[3] = load_pack_8_32bit(input + 3 * 16);
+    in[4] = load_pack_8_32bit(input + 4 * 16);
+    in[5] = load_pack_8_32bit(input + 5 * 16);
+    in[6] = load_pack_8_32bit(input + 6 * 16);
+    in[7] = load_pack_8_32bit(input + 7 * 16);
+    transpose_16bit_8x8(in, in);
+
+    in[8] = _mm_setzero_si128();
+    in[9] = _mm_setzero_si128();
+    in[10] = _mm_setzero_si128();
+    in[11] = _mm_setzero_si128();
+    in[12] = _mm_setzero_si128();
+    in[13] = _mm_setzero_si128();
+    in[14] = _mm_setzero_si128();
+    in[15] = _mm_setzero_si128();
+    idct16_8col(in);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(in + i, out);
+      out[8] = _mm_setzero_si128();
+      out[9] = _mm_setzero_si128();
+      out[10] = _mm_setzero_si128();
+      out[11] = _mm_setzero_si128();
+      out[12] = _mm_setzero_si128();
+      out[13] = _mm_setzero_si128();
+      out[14] = _mm_setzero_si128();
+      out[15] = _mm_setzero_si128();
+      idct16_8col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {
+      in = all[i];
+      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
+      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
+      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
+      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
+      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
+      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
+      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
+      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
+      transpose_32bit_8x4(in, in);
+      highbd_idct16x16_38_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      out[0] = all[0][i + 0];
+      out[1] = all[1][i + 0];
+      out[2] = all[0][i + 1];
+      out[3] = all[1][i + 1];
+      out[4] = all[0][i + 2];
+      out[5] = all[1][i + 2];
+      out[6] = all[0][i + 3];
+      out[7] = all[1][i + 3];
+      transpose_32bit_8x4(out, out);
+      highbd_idct16x16_38_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    __m128i in[16], l[16];
+
+    in[0] = load_pack_8_32bit(input + 0 * 16);
+    in[1] = load_pack_8_32bit(input + 1 * 16);
+    in[2] = load_pack_8_32bit(input + 2 * 16);
+    in[3] = load_pack_8_32bit(input + 3 * 16);
+
+    idct16x16_10_pass1(in, l);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      idct16x16_10_pass2(l + i, in);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, in[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {
+      in = all[i];
+      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+      in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+      in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+      in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+      transpose_32bit_4x4(in, in);
+      highbd_idct16x16_10_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(&all[0][i], out);
+      highbd_idct16x16_10_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index d19887d00..17b87a913 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -32,8 +32,8 @@ static INLINE __m128i multiplication_round_shift_sse4_1(
 static INLINE void highbd_multiplication_and_add_sse4_1(
     const __m128i in0, const __m128i in1, const int c0, const int c1,
     __m128i *const out0, __m128i *const out1) {
-  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
-  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
   __m128i temp1[4], temp2[4];
 
   extend_64bit(in0, temp1);
@@ -58,4 +58,15 @@ static INLINE void highbd_multiplication_and_add_sse4_1(
   *out1 = pack_4(temp2[0], temp2[1]);
 }
 
+static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
+                                                const int c1,
+                                                __m128i *const out0,
+                                                __m128i *const out1) {
+  __m128i temp[2];
+
+  extend_64bit(in, temp);
+  *out0 = multiplication_round_shift_sse4_1(temp, c0);
+  *out1 = multiplication_round_shift_sse4_1(temp, c1);
+}
+
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_