summary | refs | log | tree | commit | diff
path: root/vp9/decoder/x86
diff options
context:
space:
mode:
author Scott LaVarnway <slavarnway@google.com> 2013-05-15 13:16:02 -0400
committer Scott LaVarnway <slavarnway@google.com> 2013-05-15 13:16:02 -0400
commit a272ff25cd99f47950dddb55e94b370e95b70016 (patch)
tree 184ef2adf758d9cf09b4fab283bad047b8666635 /vp9/decoder/x86
parent 2cf0d4be122f9951b34115401aad069a9464b4c5 (diff)
download libvpx-a272ff25cd99f47950dddb55e94b370e95b70016.tar
libvpx-a272ff25cd99f47950dddb55e94b370e95b70016.tar.gz
libvpx-a272ff25cd99f47950dddb55e94b370e95b70016.tar.bz2
libvpx-a272ff25cd99f47950dddb55e94b370e95b70016.zip
WIP: 16x16 idct/recon merge
This patch eliminates the intermediate diff buffer usage by combining the short idct and the add residual into one function. The encoder can use the same code as well.

Change-Id: Iea7976b22b1927d24b8004d2a3fddae7ecca3ba1
Diffstat (limited to 'vp9/decoder/x86')
-rw-r--r-- vp9/decoder/x86/vp9_dequantize_sse2.c 59
1 file changed, 0 insertions, 59 deletions
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 38fd5aaa4..796fc123c 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -122,65 +122,6 @@ void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
_mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
- int stride) {
- const int width = 16;
- int i = 4;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
- do {
- d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
- d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
- d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
- d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
- d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
- d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
- // Prediction data.
- p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
- p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-
- p0 = _mm_unpacklo_epi8(p1, zero);
- p1 = _mm_unpackhi_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p3, zero);
- p3 = _mm_unpackhi_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p5, zero);
- p5 = _mm_unpackhi_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p7, zero);
- p7 = _mm_unpackhi_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p1 = _mm_packus_epi16(p2, p3);
- p2 = _mm_packus_epi16(p4, p5);
- p3 = _mm_packus_epi16(p6, p7);
-
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
- _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
- diff += 4 * width;
- dest += 4 * stride;
- } while (--i);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;