diff options
author | Jingning Han <jingning@google.com> | 2013-07-26 14:11:37 -0700 |
---|---|---|
committer | Jingning Han <jingning@google.com> | 2013-07-26 14:16:51 -0700 |
commit | 325e0aa6505eb480f5a55e072e195cbc3db0aacf (patch) | |
tree | f9ca4cd8e73ec48217cf2de27583846da8d32fba /vp9/common/x86 | |
parent | 52256cdbcaf07e637c964f92671dfc82321f2125 (diff) | |
download | libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.tar libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.tar.gz libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.tar.bz2 libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.zip |
Special handling for DC-only inverse 8x8 2D-DCT
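This commit adds a special-case path for the 8x8 inverse 2D-DCT that is
used when only the DC coefficient is quantized to a non-zero value. The
RECON_AND_STORE macro is also changed to leave its input register
unmodified, so the same DC value can be reused for all eight rows. For
bus_cif at 2000 kbps, it provides about a 1% speed-up at speed 0.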
Change-Id: I2523222359eec26b144cf8fd4c63a4ad63b1b011
Diffstat (limited to 'vp9/common/x86')
-rw-r--r-- | vp9/common/x86/vp9_idct_intrin_sse2.c | 27 |
1 files changed, 24 insertions, 3 deletions
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index a1e14b482..726c83f43 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, { \ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ d0 = _mm_unpacklo_epi8(d0, zero); \ - in_x = _mm_add_epi16(in_x, d0); \ - in_x = _mm_packus_epi16(in_x, in_x); \ - _mm_storel_epi64((__m128i *)(dest), in_x); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ dest += stride; \ } @@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } +void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); +} + // perform 8x8 transpose static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); |