summaryrefslogtreecommitdiff
path: root/vp9/common/x86
diff options
context:
space:
mode:
authorJingning Han <jingning@google.com>2013-07-26 14:11:37 -0700
committerJingning Han <jingning@google.com>2013-07-26 14:16:51 -0700
commit325e0aa6505eb480f5a55e072e195cbc3db0aacf (patch)
treef9ca4cd8e73ec48217cf2de27583846da8d32fba /vp9/common/x86
parent52256cdbcaf07e637c964f92671dfc82321f2125 (diff)
downloadlibvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.tar
libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.tar.gz
libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.tar.bz2
libvpx-325e0aa6505eb480f5a55e072e195cbc3db0aacf.zip
Special handle on DC only inverse 8x8 2D-DCT
This commit enables a special handle for the 8x8 inverse 2D-DCT, where only DC coefficient is quantized to be non-zero. For bus_cif at 2000 kbps, it provides about 1% speed-up at speed 0. Change-Id: I2523222359eec26b144cf8fd4c63a4ad63b1b011
Diffstat (limited to 'vp9/common/x86')
-rw-r--r--vp9/common/x86/vp9_idct_intrin_sse2.c27
1 files changed, 24 insertions, 3 deletions
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index a1e14b482..726c83f43 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
- in_x = _mm_add_epi16(in_x, d0); \
- in_x = _mm_packus_epi16(in_x, in_x); \
- _mm_storel_epi64((__m128i *)(dest), in_x); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
dest += stride; \
}
@@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 5);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+}
+
// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);