From 794a7bedbd43fe062c0e11308938f9793f2facb1 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Thu, 16 May 2013 13:52:15 -0400 Subject: WIP: 8x8 idct/recon merge This patch eliminates the intermediate diff buffer usage by combining the short idct and the add residual into one function. The encoder can use the same code as well. Change-Id: Iacfd57324fbe2b7beca5d7f3dcae25c976e67f45 --- vp9/decoder/x86/vp9_dequantize_sse2.c | 64 ----------------------------------- 1 file changed, 64 deletions(-) (limited to 'vp9/decoder/x86') diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c index 796fc123c..72036c2d4 100644 --- a/vp9/decoder/x86/vp9_dequantize_sse2.c +++ b/vp9/decoder/x86/vp9_dequantize_sse2.c @@ -58,70 +58,6 @@ void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) { *(int *)dest = _mm_cvtsi128_si32(p2); } -void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) { - const int width = 8; - const __m128i zero = _mm_setzero_si128(); - - // Diff data - const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); - const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width)); - const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width)); - const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width)); - const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width)); - const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width)); - const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width)); - const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width)); - - // Prediction data. - __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); - __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride)); - __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride)); - __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride)); - __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride)); - __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride)); - __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride)); - __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride)); - - p0 = _mm_unpacklo_epi8(p0, zero); - p1 = _mm_unpacklo_epi8(p1, zero); - p2 = _mm_unpacklo_epi8(p2, zero); - p3 = _mm_unpacklo_epi8(p3, zero); - p4 = _mm_unpacklo_epi8(p4, zero); - p5 = _mm_unpacklo_epi8(p5, zero); - p6 = _mm_unpacklo_epi8(p6, zero); - p7 = _mm_unpacklo_epi8(p7, zero); - - p0 = _mm_add_epi16(p0, d0); - p1 = _mm_add_epi16(p1, d1); - p2 = _mm_add_epi16(p2, d2); - p3 = _mm_add_epi16(p3, d3); - p4 = _mm_add_epi16(p4, d4); - p5 = _mm_add_epi16(p5, d5); - p6 = _mm_add_epi16(p6, d6); - p7 = _mm_add_epi16(p7, d7); - - p0 = _mm_packus_epi16(p0, p1); - p2 = _mm_packus_epi16(p2, p3); - p4 = _mm_packus_epi16(p4, p5); - p6 = _mm_packus_epi16(p6, p7); - - _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0); - p0 = _mm_srli_si128(p0, 8); - _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0); - - _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2); - p2 = _mm_srli_si128(p2, 8); - _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2); - - _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4); - p4 = _mm_srli_si128(p4, 8); - _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4); - - _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6); - p6 = _mm_srli_si128(p6, 8); - _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); -} - void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest, int stride) { uint8_t abs_diff; -- cgit v1.2.3