WIP: 8x8 idct/recon merge

This patch eliminates the intermediate diff buffer by combining the short IDCT and the add-residual step into a single function. The encoder can use the same code as well.

Change-Id: Iacfd57324fbe2b7beca5d7f3dcae25c976e67f45
author: Scott LaVarnway <slavarnway@google.com> 2013-05-16 13:52:15 -0400
committer: Scott LaVarnway <slavarnway@google.com> 2013-05-16 13:52:15 -0400
commit: 794a7bedbd43fe062c0e11308938f9793f2facb1 (patch)
tree: efede5807ab73c2bcc8209802230ebe40fd64427 /vp9
parent: a272ff25cd99f47950dddb55e94b370e95b70016 (diff)
download: libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.tar
libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.tar.gz
libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.tar.bz2
libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.zip
6 files changed, 60 insertions, 142 deletions
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index b166fcbba..2ff7696f8 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -219,27 +219,27 @@ static void idct8_1d(int16_t *input, int16_t *output) {
   output[7] = step1[0] - step1[7];
 }
 
-void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
-  // Rows
+  // First transform rows
   for (i = 0; i < 8; ++i) {
     idct8_1d(input, outptr);
     input += 8;
     outptr += 8;
   }
 
-  // Columns
+  // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -400,8 +400,8 @@ static const transform_2d IHT_8[] = {
   { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -421,14 +421,14 @@ void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
-  }
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
+                                int dest_stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
@@ -447,7 +447,8 @@ void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ea60fbb10..5ecb0af44 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -88,9 +88,6 @@ if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
 prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_4x4 sse2
 
-prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_8x8 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -188,11 +185,11 @@ specialize vp9_short_idct4x4_1
 prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct4x4 sse2
 
-prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct8x8 sse2
+prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_add sse2
 
-prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_8x8 sse2
+prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_8x8_add sse2
 
 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_8x8
@@ -215,8 +212,8 @@ specialize vp9_short_idct1_32x32
 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_32x32_add
 
-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht8x8
+prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht8x8_add
 
 prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht4x4
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 667da3369..ab8604c75 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -403,8 +403,18 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
   in6 = _mm_subs_epi16(stp1_1, stp1_6); \
   in7 = _mm_subs_epi16(stp1_0, stp2_7);
 
-void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
+
+void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -461,19 +471,17 @@ void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
   in6 = _mm_srai_epi16(in6, 5);
   in7 = _mm_srai_epi16(in7, 5);
 
-  // Store results
-  _mm_store_si128((__m128i *)output, in0);
-  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+  RECON_AND_STORE(dest, in0);
+  RECON_AND_STORE(dest, in1);
+  RECON_AND_STORE(dest, in2);
+  RECON_AND_STORE(dest, in3);
+  RECON_AND_STORE(dest, in4);
+  RECON_AND_STORE(dest, in5);
+  RECON_AND_STORE(dest, in6);
+  RECON_AND_STORE(dest, in7);
 }
 
-void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -612,15 +620,14 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
   in6 = _mm_srai_epi16(in6, 5);
   in7 = _mm_srai_epi16(in7, 5);
 
-  // Store results
-  _mm_store_si128((__m128i *)output, in0);
-  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+  RECON_AND_STORE(dest, in0);
+  RECON_AND_STORE(dest, in1);
+  RECON_AND_STORE(dest, in2);
+  RECON_AND_STORE(dest, in3);
+  RECON_AND_STORE(dest, in4);
+  RECON_AND_STORE(dest, in5);
+  RECON_AND_STORE(dest, in6);
+  RECON_AND_STORE(dest, in7);
 }
 
 #define IDCT16x16_1D \
@@ -752,16 +759,6 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-#define RECON_AND_STORE(dest, in_x) \
-  {                                                     \
-     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-      d0 = _mm_unpacklo_epi8(d0, zero); \
-      in_x = _mm_add_epi16(in_x, d0); \
-      in_x = _mm_packus_epi16(in_x, in_x); \
-      _mm_storel_epi64((__m128i *)(dest), in_x); \
-      dest += stride; \
-  }
-
 void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index bc943fa85..10b585b3f 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -101,10 +101,6 @@ void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
   add_residual(diff, dest, stride, 4, 4);
 }
 
-void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 8, 8);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -151,11 +147,8 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
     vp9_idct_add_8x8(input, dest, stride, eob);
   } else {
     if (eob > 0) {
-      DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
-      vp9_short_iht8x8(input, output, 8, tx_type);
+      vp9_short_iht8x8_add(input, dest, stride, tx_type);
       vpx_memset(input, 0, 128);
-      vp9_add_residual_8x8(output, dest, stride);
     }
   }
 }
@@ -210,8 +203,6 @@ void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
 }
 
 void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
   // If dc is 1, then input[0] is the reconstructed value, do not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
 
@@ -233,20 +224,15 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
       vp9_add_constant_residual_8x8(out, dest, stride);
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      vp9_short_idct10_8x8(input, output, 16);
-
+      vp9_short_idct10_8x8_add(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[8] = input[9] = input[10] = 0;
       input[16] = input[17] = 0;
       input[24] = 0;
-
-      vp9_add_residual_8x8(output, dest, stride);
 #endif
     } else {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct8x8(input, output, 8 << 1);
+      vp9_short_idct8x8_add(input, dest, stride);
       vpx_memset(input, 0, 128);
-      vp9_add_residual_8x8(output, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 796fc123c..72036c2d4 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -58,70 +58,6 @@ void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
   *(int *)dest = _mm_cvtsi128_si32(p2);
 }
 
-void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
-  const int width = 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-  const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
-  const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
-  const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
-  const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
-  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
-  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
-  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
-  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
-  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
-  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
-  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-  p4 = _mm_unpacklo_epi8(p4, zero);
-  p5 = _mm_unpacklo_epi8(p5, zero);
-  p6 = _mm_unpacklo_epi8(p6, zero);
-  p7 = _mm_unpacklo_epi8(p7, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-  p4 = _mm_add_epi16(p4, d4);
-  p5 = _mm_add_epi16(p5, d5);
-  p6 = _mm_add_epi16(p6, d6);
-  p7 = _mm_add_epi16(p7, d7);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-  p4 = _mm_packus_epi16(p4, p5);
-  p6 = _mm_packus_epi16(p6, p7);
-
-  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
-  p0 = _mm_srli_si128(p0, 8);
-  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
-  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
-  p2 = _mm_srli_si128(p2, 8);
-  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
-  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
-  p4 = _mm_srli_si128(p4, 8);
-  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
-  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
-  p6 = _mm_srli_si128(p6, 8);
-  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 221de7426..bbc97da61 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -534,11 +534,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
-        vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                          diff, bw * 2);
+        vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                              block, 16), dst, xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                         diff, bw, tx_type);
+        vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                             block, 16), dst, xd->plane[plane].dst.stride,
+                             tx_type);
       }
       *wip_txfrm_size = 8;
       break;
@@ -589,7 +590,7 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 8)
     vp9_recon_sby(xd, bsize);
 }
 
@@ -606,7 +607,7 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  if (wip_txfrm_size < 16)
+  if (wip_txfrm_size < 8)
     vp9_recon_sbuv(xd, bsize);
 }
 
@@ -628,13 +629,13 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
   // wip version... will use foreach_transformed_block when done
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-  if (wip_txfrm_size < 16)
+  if (wip_txfrm_size < 8)
     vp9_recon_sby(xd, bsize);
   wip_txfrm_size = 0;
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  if (wip_txfrm_size < 16)
+  if (wip_txfrm_size < 8)
     vp9_recon_sbuv(xd, bsize);
 #endif
 }
author	Scott LaVarnway <slavarnway@google.com>	2013-05-16 13:52:15 -0400
committer	Scott LaVarnway <slavarnway@google.com>	2013-05-16 13:52:15 -0400
commit	794a7bedbd43fe062c0e11308938f9793f2facb1 (patch)
tree	efede5807ab73c2bcc8209802230ebe40fd64427 /vp9
parent	a272ff25cd99f47950dddb55e94b370e95b70016 (diff)
download	libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.tar libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.tar.gz libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.tar.bz2 libvpx-794a7bedbd43fe062c0e11308938f9793f2facb1.zip