6 files changed, 77 insertions, 77 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index b58945e51..26458e8a2 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -391,8 +391,8 @@ typedef struct macroblockd {
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
   void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
   void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
     struct macroblockd *xd);
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 2ff7696f8..80af49e84 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,12 +18,12 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
+  int16_t output[16];
   int a1, b1, c1, d1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
     a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
@@ -37,63 +37,60 @@ void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
     op[3] = (d1 - c1) >> 1;
 
     ip += 4;
-    op += half_pitch;
+    op += 4;
   }
 
   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0] + ip[4 * 3];
+    b1 = ip[4 * 1] + ip[4 * 2];
+    c1 = ip[4 * 1] - ip[4 * 2];
+    d1 = ip[4 * 0] - ip[4 * 3];
 
 
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((a1 + b1 + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       ((c1 + d1) >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       ((a1 - b1) >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       ((d1 - c1) >> 1));
 
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
 
   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
   op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
 
   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((ip[0] + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       (ip[0] >> 1));
     ip++;
-    op++;
+    dest++;
   }
 }
 
 void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
                                  uint8_t *dst_ptr,
                                  int pitch, int stride) {
-  int r, c;
   int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
+  vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
 }
 
 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
@@ -116,10 +113,9 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -138,22 +134,24 @@ void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }
 
@@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
@@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
-
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index 01859df5e..d47fca190 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,11 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index 2aeb584c9..dbdc50a2a 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,7 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index cf8dd33c4..cb353b1ed 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO
 specialize vp9_intra4x4_predict;
 
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add
 
-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2
 
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2
@@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32
 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_32x32_add
 
+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
+
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht8x8_add
 
-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
-
 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16_add
 
@@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
+
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add
 
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ab8604c75..599dcff93 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
 
   // Rows
@@ -188,14 +187,23 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  {                                                     \
+      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      *(int *)dest = _mm_cvtsi128_si32(d0); \
+      dest += stride; \
+  }
+
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
 
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {