author     Scott LaVarnway <slavarnway@google.com>   2013-05-20 13:03:17 -0400
committer  Scott LaVarnway <slavarnway@google.com>   2013-05-20 13:03:17 -0400
commit     ba48a11130aa88cf20c2c54e43585968ce49e964 (patch)
tree       6a2b02175ec2efc92b5d8a4dca42e425fa70ba3b /vp9/decoder
parent     9aa37a51b28596137ba6fbcb1411c070287d6e11 (diff)
WIP: 4x4 idct/recon merge
This patch eliminates the intermediate diff buffer by combining the short
idct and the residual add into one function. The encoder can use the same
code as well.

Change-Id: I296604bf73579c45105de0dd1adbcc91bcc53c22
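For readers skimming the diff, the sketch below shows the general shape of a
fused transform-and-reconstruct routine. It is illustrative only, not the
libvpx implementation: the transform is a plain 4-point Hadamard butterfly
rather than the VP9 idct, and clip_pixel / ihadamard4x4_add are local names
invented for the example. The structural point matches the patch: the column
pass adds its result straight into the prediction in dest, so no
caller-visible diff buffer (and no separate vp9_add_residual_4x4 pass) is
needed.

#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* Fused 4x4 inverse transform + reconstruction (sketch): the column pass
 * writes clipped (prediction + residual) pixels directly into dest. */
void ihadamard4x4_add(const int16_t *input, uint8_t *dest, int stride) {
  int16_t tmp[16];
  int i;

  /* Row pass into a small on-stack temporary. */
  for (i = 0; i < 4; ++i) {
    const int a = input[i * 4 + 0], b = input[i * 4 + 1];
    const int c = input[i * 4 + 2], d = input[i * 4 + 3];
    tmp[i * 4 + 0] = (int16_t)(a + b + c + d);
    tmp[i * 4 + 1] = (int16_t)(a - b + c - d);
    tmp[i * 4 + 2] = (int16_t)(a + b - c - d);
    tmp[i * 4 + 3] = (int16_t)(a - b - c + d);
  }

  /* Column pass: compute each residual and immediately add it to the
   * prediction already in dest -- this is the idct/recon merge. */
  for (i = 0; i < 4; ++i) {
    const int a = tmp[0 * 4 + i], b = tmp[1 * 4 + i];
    const int c = tmp[2 * 4 + i], d = tmp[3 * 4 + i];
    const int r0 = (a + b + c + d + 8) >> 4;  /* >> 4 undoes the 16x gain */
    const int r1 = (a - b + c - d + 8) >> 4;  /* of the two butterfly passes */
    const int r2 = (a + b - c - d + 8) >> 4;
    const int r3 = (a - b - c + d + 8) >> 4;
    dest[0 * stride + i] = clip_pixel(dest[0 * stride + i] + r0);
    dest[1 * stride + i] = clip_pixel(dest[1 * stride + i] + r1);
    dest[2 * stride + i] = clip_pixel(dest[2 * stride + i] + r2);
    dest[3 * stride + i] = clip_pixel(dest[3 * stride + i] + r3);
  }
}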
Diffstat (limited to 'vp9/decoder')
-rw-r--r--  vp9/decoder/vp9_decodframe.c           |  4
-rw-r--r--  vp9/decoder/vp9_idct_blk.c             | 47
-rw-r--r--  vp9/decoder/x86/vp9_dequantize_sse2.c  | 43
3 files changed, 6 insertions(+), 88 deletions(-)
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b200e6ccc..b866e95cf 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -998,14 +998,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
pc->uv_dc_delta_q == 0 &&
pc->uv_ac_delta_q == 0;
if (xd->lossless) {
- xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
- xd->inv_txm4x4 = vp9_short_iwalsh4x4;
xd->itxm_add = vp9_idct_add_lossless_c;
xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c;
xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
} else {
- xd->inv_txm4x4_1 = vp9_short_idct4x4_1;
- xd->inv_txm4x4 = vp9_short_idct4x4;
xd->itxm_add = vp9_idct_add;
xd->itxm_add_y_block = vp9_idct_add_y_block;
xd->itxm_add_uv_block = vp9_idct_add_uv_block;
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 10b585b3f..7726598bc 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
}
}
-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
- int width, int height) {
- int r, c;
-
- for (r = 0; r < height; r++) {
- for (c = 0; c < width; c++)
- dest[c] = clip_pixel(diff[c] + dest[c]);
-
- dest += stride;
- diff += width;
- }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 4, 4);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
@@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
if (tx_type == DCT_DCT) {
vp9_idct_add(input, dest, stride, eob);
} else {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
- vp9_short_iht4x4(input, output, 4, tx_type);
+ vp9_short_iht4x4_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
}
}
@@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
}
void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
if (eob > 1) {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct4x4(input, output, 4 << 1);
+ vp9_short_idct4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
} else {
vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
((int *)input)[0] = 0;
@@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
}
void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
input[0] = dc;
-
- // the idct halves ( >> 1) the pitch
- vp9_short_idct4x4(input, output, 4 << 1);
+ vp9_short_idct4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
}
void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
if (eob > 1) {
- vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+ vp9_short_iwalsh4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
} else {
- vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
+ vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
((int *)input)[0] = 0;
}
}
void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
int stride, int dc) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
input[0] = dc;
- vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+ vp9_short_iwalsh4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
}
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 72036c2d4..54ec67f24 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,49 +15,6 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
- const int width = 4;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
- const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
- const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
- const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
- // Prediction data.
- __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
- __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
- __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
- __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
-
- p0 = _mm_unpacklo_epi8(p0, zero);
- p1 = _mm_unpacklo_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p2, zero);
- p3 = _mm_unpacklo_epi8(p3, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
-
- p0 = _mm_packus_epi16(p0, p1);
- p2 = _mm_packus_epi16(p2, p3);
-
- *(int *)dest = _mm_cvtsi128_si32(p0);
- dest += stride;
-
- p0 = _mm_srli_si128(p0, 8);
- *(int *)dest = _mm_cvtsi128_si32(p0);
- dest += stride;
-
- *(int *)dest = _mm_cvtsi128_si32(p2);
- dest += stride;
-
- p2 = _mm_srli_si128(p2, 8);
- *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;