6 files changed, 97 insertions, 130 deletions
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 3ec093f73..5e6384c7a 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1249,10 +1249,9 @@ static void idct32_1d(int16_t *input, int16_t *output) {
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1269,7 +1268,8 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -1279,10 +1279,10 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1302,6 +1302,7 @@ void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 02d32530a..c45d03084 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -94,9 +94,6 @@ specialize vp9_add_residual_8x8 sse2
 prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_16x16 sse2
 
-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -212,15 +209,14 @@ specialize vp9_short_idct10_16x16 sse2
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
 
-
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
 
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
 
 prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht8x8
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index dd7e68aa3..e53a937f4 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -1319,8 +1319,7 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -1832,6 +1831,8 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
       col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
+      const __m128i zero = _mm_setzero_si128();
+
       // 2_D: Calculate the results and store them to destination.
       in0 = _mm_add_epi16(stp1_0, stp1_31);
       in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1934,50 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
 
-      output += 8;
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
+
+      dest += 8 - (stride * 32);
     }
   }
 }
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 3480df25e..faaee7378 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -109,10 +109,6 @@ void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
   add_residual(diff, dest, stride, 16, 16);
 }
 
-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 32, 32);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -321,20 +317,16 @@ void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
       input[0] = 0;
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_32x32(input, output, 64);
-
+      vp9_short_idct10_32x32_add_c(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[32] = input[33] = input[34] = 0;
       input[64] = input[65] = 0;
       input[96] = 0;
 
-      vp9_add_residual_32x32(output, dest, stride);
 #endif
     } else {
-      vp9_short_idct32x32(input, output, 64);
+      vp9_short_idct32x32_add(input, dest, stride);
       vpx_memset(input, 0, 2048);
-      vp9_add_residual_32x32(output, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 1296b704a..38fd5aaa4 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -181,65 +181,6 @@ void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
   } while (--i);
 }
 
-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 32;
-  int i = 16;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
-    p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
-    diff += 2 * width;
-    dest += 2 * stride;
-  } while (--i);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index e4002d689..db18555f9 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -425,6 +425,7 @@ struct encode_b_args {
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
+  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };
 
 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -493,6 +494,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
+  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -500,6 +502,10 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
   int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
                                                   raster_block,
                                                   xd->plane[plane].diff);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 xd->plane[plane].dst.stride);
   TX_TYPE tx_type = DCT_DCT;
 
   xform_quant(plane, block, bsize, ss_txfrm_size, arg);
@@ -509,8 +515,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   switch (ss_txfrm_size / 2) {
     case TX_32X32:
-      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                          diff, bw * 2);
+        vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
+        *wip_txfrm_size = 32;
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@@ -521,6 +528,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                            diff, bw, tx_type);
       }
+      *wip_txfrm_size = 16;
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -531,6 +539,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw, tx_type);
       }
+      *wip_txfrm_size = 8;
       break;
     case TX_4X4:
       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -544,6 +553,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw, tx_type);
       }
+      *wip_txfrm_size = 4;
       break;
   }
 }
@@ -551,7 +561,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      xform_quant, &arg);
@@ -560,7 +570,7 @@ void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};
 
   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@@ -569,7 +579,8 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
@@ -577,15 +588,16 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-
-  vp9_recon_sby(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
 }
 
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
@@ -593,20 +605,35 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  vp9_recon_sbuv(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sbuv(xd, bsize);
 }
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
-
+#if 0
   foreach_transformed_block(xd, bsize, encode_block, &arg);
 
   vp9_recon_sb(xd, bsize);
+#else
+  // wip version... will use foreach_transformed_block when done
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block, &arg);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
+  wip_txfrm_size = 0;
+
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+  if (wip_txfrm_size < 32)
+    vp9_recon_sbuv(xd, bsize);
+#endif
 }