author     Jingning Han <jingning@google.com>       2017-02-01 12:19:01 -0800
committer  Jingning Han <jingning@google.com>       2017-02-01 14:47:53 -0800
commit     8f953897429f75ce9f44e80feaae810a6c9752bd (patch)
tree       ec6cfc17323f3bc588d147ec1637870b135e4857 /vpx_dsp
parent     a7949f2dd2bb68516a7ab4bc8c60f9d72231b800 (diff)
Add SSSE3 intrinsic 8x8 inverse 2D-DCT
The intrinsic version reduces the average cycle count from 183 to 175.
Change-Id: I7c1bcdb0a830266e93d8347aed38120fb3be0e03
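
[Note] The win here comes from SSSE3's pmulhrsw (_mm_mulhrs_epi16): each multiply by cospi_16_64 that SSE2 has to widen to 32 bits, round, shift, and narrow again collapses into a single 16-bit instruction, because pmulhrsw computes (a * b + (1 << 14)) >> 15 per lane. Feeding it 2 * cospi_16_64 (23170, which still fits in an int16) makes that bit-exact with the Q14 rounding used throughout the transform. The sketch below is not part of the patch; the helper names are illustrative and DCT_CONST_BITS/DCT_CONST_ROUNDING are defined locally so it stands alone.

#include <emmintrin.h>   // SSE2
#include <tmmintrin.h>   // SSSE3: _mm_mulhrs_epi16

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

// SSE2: form the full 32-bit products, add the rounding bias, shift right
// by 14, then saturate back down to 16 bits.
static __m128i mul_round_shift_sse2(__m128i x, int16_t c) {
  const __m128i k = _mm_set1_epi16(c);
  const __m128i rnd = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i lo = _mm_mullo_epi16(x, k);  // low 16 bits of x[i] * c
  const __m128i hi = _mm_mulhi_epi16(x, k);  // high 16 bits of x[i] * c
  __m128i p0 = _mm_unpacklo_epi16(lo, hi);   // 32-bit products, lanes 0-3
  __m128i p1 = _mm_unpackhi_epi16(lo, hi);   // 32-bit products, lanes 4-7
  p0 = _mm_srai_epi32(_mm_add_epi32(p0, rnd), DCT_CONST_BITS);
  p1 = _mm_srai_epi32(_mm_add_epi32(p1, rnd), DCT_CONST_BITS);
  return _mm_packs_epi32(p0, p1);
}

// SSSE3: one instruction. (x * 2c + (1 << 14)) >> 15 equals
// (x * c + (1 << 13)) >> 14, so doubling the Q14 constant reproduces the
// rounding above; this requires c < 16384, which cospi_16_64 (11585) satisfies.
static __m128i mul_round_shift_ssse3(__m128i x, int16_t c) {
  return _mm_mulhrs_epi16(x, _mm_set1_epi16(2 * c));
}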
Diffstat (limited to 'vpx_dsp')

-rw-r--r--  vpx_dsp/vpx_dsp.mk                     |    1
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl           |    5
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c            |   31
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.h            |   30
-rw-r--r--  vpx_dsp/x86/inv_txfm_ssse3.c           |  184
-rw-r--r--  vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm  |   50

6 files changed, 218 insertions(+), 83 deletions(-)
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index bb20ea274..675c8cb23 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -195,6 +195,7 @@ DSP_SRCS-yes += inv_txfm.c
 DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
 endif  # ARCH_X86_64
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 3cb2011b8..a5b79ba4e 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -678,8 +678,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
   specialize qw/vpx_idct4x4_1_add neon sse2/;
 
+  # TODO(jingning): Add ssse3 for high bit-depth
   add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-  specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
+  specialize qw/vpx_idct8x8_64_add neon sse2/;
 
   add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
   specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
@@ -766,7 +767,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
 
   add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
-  specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+  specialize qw/vpx_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
 
   add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
   specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 487a474a6..e307579cb 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -263,37 +263,6 @@ void iadst4_sse2(__m128i *in) {
   in[1] = _mm_packs_epi32(u[2], u[3]);
 }
 
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
-  }
-
 #define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
   {                                                                      \
     const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index d762a04ab..381a5f104 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -46,6 +46,36 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                      out2, out3, out4, out5, out6, out7)                 \
+  {                                                                       \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
+                                                                          \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
+                                                                          \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
+  }
 
 #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
   {                                                   \
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 000000000..da4119c8e
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int i;
+
+  // Load input data.
+  in0 = load_input_data(input);
+  in1 = load_input_data(input + 8 * 1);
+  in2 = load_input_data(input + 8 * 2);
+  in3 = load_input_data(input + 8 * 3);
+  in4 = load_input_data(input + 8 * 4);
+  in5 = load_input_data(input + 8 * 5);
+  in6 = load_input_data(input + 8 * 6);
+  in7 = load_input_data(input + 8 * 7);
+
+  // 2-D
+  for (i = 0; i < 2; i++) {
+    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                  in4, in5, in6, in7);
+
+    // 4-stage 1D idct8x8
+    {
+      /* Stage1 */
+      {
+        const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
+        const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
+        const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
+        const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
+
+        {
+          tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+          tmp1 = _mm_madd_epi16(hi_17, stg1_0);
+          tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+          tmp3 = _mm_madd_epi16(hi_17, stg1_1);
+          tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+          tmp5 = _mm_madd_epi16(hi_35, stg1_2);
+          tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+          tmp7 = _mm_madd_epi16(hi_35, stg1_3);
+
+          tmp0 = _mm_add_epi32(tmp0, rounding);
+          tmp1 = _mm_add_epi32(tmp1, rounding);
+          tmp2 = _mm_add_epi32(tmp2, rounding);
+          tmp3 = _mm_add_epi32(tmp3, rounding);
+          tmp4 = _mm_add_epi32(tmp4, rounding);
+          tmp5 = _mm_add_epi32(tmp5, rounding);
+          tmp6 = _mm_add_epi32(tmp6, rounding);
+          tmp7 = _mm_add_epi32(tmp7, rounding);
+
+          tmp0 = _mm_srai_epi32(tmp0, 14);
+          tmp1 = _mm_srai_epi32(tmp1, 14);
+          tmp2 = _mm_srai_epi32(tmp2, 14);
+          tmp3 = _mm_srai_epi32(tmp3, 14);
+          tmp4 = _mm_srai_epi32(tmp4, 14);
+          tmp5 = _mm_srai_epi32(tmp5, 14);
+          tmp6 = _mm_srai_epi32(tmp6, 14);
+          tmp7 = _mm_srai_epi32(tmp7, 14);
+
+          stp1_4 = _mm_packs_epi32(tmp0, tmp1);
+          stp1_7 = _mm_packs_epi32(tmp2, tmp3);
+          stp1_5 = _mm_packs_epi32(tmp4, tmp5);
+          stp1_6 = _mm_packs_epi32(tmp6, tmp7);
+        }
+      }
+
+      /* Stage2 */
+      {
+        const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
+        const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
+
+        {
+          tmp0 = _mm_add_epi16(in0, in4);
+          tmp1 = _mm_sub_epi16(in0, in4);
+          stp2_0 = _mm_mulhrs_epi16(tmp0, stg2_0);
+          stp2_1 = _mm_mulhrs_epi16(tmp1, stg2_0);
+
+          tmp0 = _mm_madd_epi16(lo_26, stg2_2);
+          tmp1 = _mm_madd_epi16(hi_26, stg2_2);
+          tmp2 = _mm_madd_epi16(lo_26, stg2_3);
+          tmp3 = _mm_madd_epi16(hi_26, stg2_3);
+
+          tmp0 = _mm_add_epi32(tmp0, rounding);
+          tmp1 = _mm_add_epi32(tmp1, rounding);
+          tmp2 = _mm_add_epi32(tmp2, rounding);
+          tmp3 = _mm_add_epi32(tmp3, rounding);
+
+          tmp0 = _mm_srai_epi32(tmp0, 14);
+          tmp1 = _mm_srai_epi32(tmp1, 14);
+          tmp2 = _mm_srai_epi32(tmp2, 14);
+          tmp3 = _mm_srai_epi32(tmp3, 14);
+
+          stp2_2 = _mm_packs_epi32(tmp0, tmp1);
+          stp2_3 = _mm_packs_epi32(tmp2, tmp3);
+        }
+
+        stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+        stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+        stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+        stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+      }
+
+      /* Stage3 */
+      {
+        stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+        stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+        stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+        stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+        tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
+        tmp2 = _mm_add_epi16(stp2_6, stp2_5);
+        stp1_5 = _mm_mulhrs_epi16(tmp0, stg2_0);
+        stp1_6 = _mm_mulhrs_epi16(tmp2, stg2_0);
+      }
+
+      /* Stage4  */
+      in0 = _mm_add_epi16(stp1_0, stp2_7);
+      in1 = _mm_add_epi16(stp1_1, stp1_6);
+      in2 = _mm_add_epi16(stp1_2, stp1_5);
+      in3 = _mm_add_epi16(stp1_3, stp2_4);
+      in4 = _mm_sub_epi16(stp1_3, stp2_4);
+      in5 = _mm_sub_epi16(stp1_2, stp1_5);
+      in6 = _mm_sub_epi16(stp1_1, stp1_6);
+      in7 = _mm_sub_epi16(stp1_0, stp2_7);
+    }
+  }
+
+  // Final rounding and shift
+  in0 = _mm_adds_epi16(in0, final_rounding);
+  in1 = _mm_adds_epi16(in1, final_rounding);
+  in2 = _mm_adds_epi16(in2, final_rounding);
+  in3 = _mm_adds_epi16(in3, final_rounding);
+  in4 = _mm_adds_epi16(in4, final_rounding);
+  in5 = _mm_adds_epi16(in5, final_rounding);
+  in6 = _mm_adds_epi16(in6, final_rounding);
+  in7 = _mm_adds_epi16(in7, final_rounding);
+
+  in0 = _mm_srai_epi16(in0, 5);
+  in1 = _mm_srai_epi16(in1, 5);
+  in2 = _mm_srai_epi16(in2, 5);
+  in3 = _mm_srai_epi16(in3, 5);
+  in4 = _mm_srai_epi16(in4, 5);
+  in5 = _mm_srai_epi16(in5, 5);
+  in6 = _mm_srai_epi16(in6, 5);
+  in7 = _mm_srai_epi16(in7, 5);
+
+  RECON_AND_STORE(dest + 0 * stride, in0);
+  RECON_AND_STORE(dest + 1 * stride, in1);
+  RECON_AND_STORE(dest + 2 * stride, in2);
+  RECON_AND_STORE(dest + 3 * stride, in3);
+  RECON_AND_STORE(dest + 4 * stride, in4);
+  RECON_AND_STORE(dest + 5 * stride, in5);
+  RECON_AND_STORE(dest + 6 * stride, in6);
+  RECON_AND_STORE(dest + 7 * stride, in7);
+}
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index a0dbd60e6..6f8cfe580 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -222,56 +222,6 @@ SECTION .text
 %endmacro
 
 INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
-  mova     m8, [pd_8192]
-  mova    m11, [pw_16]
-  mova    m12, [pw_11585x2]
-
-  lea      r3, [2 * strideq]
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova     m0, [inputq +   0]
-  packssdw m0, [inputq +  16]
-  mova     m1, [inputq +  32]
-  packssdw m1, [inputq +  48]
-  mova     m2, [inputq +  64]
-  packssdw m2, [inputq +  80]
-  mova     m3, [inputq +  96]
-  packssdw m3, [inputq + 112]
-  mova     m4, [inputq + 128]
-  packssdw m4, [inputq + 144]
-  mova     m5, [inputq + 160]
-  packssdw m5, [inputq + 176]
-  mova     m6, [inputq + 192]
-  packssdw m6, [inputq + 208]
-  mova     m7, [inputq + 224]
-  packssdw m7, [inputq + 240]
-%else
-  mova     m0, [inputq +   0]
-  mova     m1, [inputq +  16]
-  mova     m2, [inputq +  32]
-  mova     m3, [inputq +  48]
-  mova     m4, [inputq +  64]
-  mova     m5, [inputq +  80]
-  mova     m6, [inputq +  96]
-  mova     m7, [inputq + 112]
-%endif
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
-  IDCT8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
-  IDCT8_1D
-
-  pxor    m12, m12
-  ADD_STORE_8P_2X 0, 1, 9, 10, 12
-  lea outputq, [outputq + r3]
-  ADD_STORE_8P_2X 2, 3, 9, 10, 12
-  lea outputq, [outputq + r3]
-  ADD_STORE_8P_2X 4, 5, 9, 10, 12
-  lea outputq, [outputq + r3]
-  ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
-  RET
-
 ; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
 cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
   mova     m8, [pd_8192]
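
[Note] With the x86_64-only asm idct8x8_64_add retired, the intrinsic version is reached through libvpx's normal run-time dispatch, and it also builds for 32-bit x86 -- which is why the specialize line above drops "$ssse3_x86_64" for plain ssse3. A minimal caller sketch under that assumption follows; reconstruct_8x8 is a hypothetical helper, not a libvpx function, and library users normally never call the RTCD init themselves (codec init does it).

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"  // tran_low_t

// Add the full 8x8 inverse DCT of `coeffs` into the 8x8 prediction block at
// `dest` (row pitch of `stride` bytes).
static void reconstruct_8x8(const tran_low_t coeffs[64], uint8_t *dest,
                            int stride) {
  // One-time setup: points vpx_idct8x8_64_add at the best kernel for the
  // host CPU -- with this patch, vpx_idct8x8_64_add_ssse3 on SSSE3 machines.
  // Repeated here only so the sketch stands alone.
  vpx_dsp_rtcd();
  vpx_idct8x8_64_add(coeffs, dest, stride);
}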