diff options
-rw-r--r--   vp9/common/x86/vp9_intrapred_ssse3.asm | 177
-rw-r--r--   vp9/encoder/vp9_bitstream.c            |   4
-rw-r--r--   vp9/encoder/vp9_block.h                |  19
-rw-r--r--   vp9/encoder/vp9_dct.c                  |  50
-rw-r--r--   vp9/encoder/vp9_onyx_int.h             |   1
-rw-r--r--   vp9/encoder/vp9_rdopt.c                | 130
-rw-r--r--   vp9/vp9_cx_iface.c                     |  26
7 files changed, 303 insertions(+), 104 deletions(-)
diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm index 8c03de7cd..dc483a01e 100644 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -612,3 +612,180 @@ cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset mova [dstq+stride3q ], m4 RESTORE_GOT RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... 
A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh 
- + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 7619d76d4..2f59d333a 100644 --- a/vp9/encoder/vp9_bitstream.c +++ 
b/vp9/encoder/vp9_bitstream.c @@ -388,8 +388,8 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { mi->ref_frame[0]); } - // if using the prediction mdoel we have nothing further to do because - // the reference frame is fully coded by the segment + // If using the prediction model we have nothing further to do because + // the reference frame is fully coded by the segment. } static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 6314b6009..a63bd1b8a 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -178,4 +178,23 @@ struct macroblock { int y_blocks); }; +struct rdcost_block_args { + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + TX_SIZE tx_size; + int bw; + int bh; + int rate; + int64_t dist; + int64_t sse; + int this_rate; + int64_t this_dist; + int64_t this_sse; + int64_t this_rd; + int64_t best_rd; + int skip; + const int16_t *scan, *nb; +}; + #endif // VP9_ENCODER_VP9_BLOCK_H_ diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index 27e4cd07f..3008e46dd 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -17,7 +17,7 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_idct.h" -static void fdct4_1d(int16_t *input, int16_t *output) { +static void fdct4(int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; @@ -102,7 +102,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) { } } -static void fadst4_1d(int16_t *input, int16_t *output) { +static void fadst4(int16_t *input, int16_t *output) { int x0, x1, x2, x3; int s0, s1, s2, s3, s4, s5, s6, s7; @@ -143,10 +143,10 @@ static void fadst4_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_4[] = { - { fdct4_1d, fdct4_1d }, // DCT_DCT = 0 - { fadst4_1d, fdct4_1d }, // ADST_DCT = 1 - { fdct4_1d, fadst4_1d }, // DCT_ADST = 2 - { fadst4_1d, fadst4_1d } // ADST_ADST = 3 + { fdct4, 
fdct4 }, // DCT_DCT = 0 + { fadst4, fdct4 }, // ADST_DCT = 1 + { fdct4, fadst4 }, // DCT_ADST = 2 + { fadst4, fadst4 } // ADST_ADST = 3 }; void vp9_short_fht4x4_c(int16_t *input, int16_t *output, @@ -183,7 +183,7 @@ void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { vp9_short_fdct4x4_c(input + 4, output + 16, pitch); } -static void fdct8_1d(int16_t *input, int16_t *output) { +static void fdct8(int16_t *input, int16_t *output) { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; /*canbe16*/ int x0, x1, x2, x3; @@ -198,7 +198,7 @@ static void fdct8_1d(int16_t *input, int16_t *output) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -259,7 +259,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { s6 = (input[1 * stride] - input[6 * stride]) * 4; s7 = (input[0 * stride] - input[7 * stride]) * 4; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -301,7 +301,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { // Rows for (i = 0; i < 8; ++i) { - fdct8_1d(&intermediate[i * 8], &final_output[i * 8]); + fdct8(&intermediate[i * 8], &final_output[i * 8]); for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; } @@ -368,7 +368,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); } - // Work on the first eight values; fdct8_1d(input, even_results); + // Work on the first eight values; fdct8(input, even_results); { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -384,7 +384,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, 
step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -486,7 +486,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { } } -static void fadst8_1d(int16_t *input, int16_t *output) { +static void fadst8(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -558,10 +558,10 @@ static void fadst8_1d(int16_t *input, int16_t *output) { } static const transform_2d FHT_8[] = { - { fdct8_1d, fdct8_1d }, // DCT_DCT = 0 - { fadst8_1d, fdct8_1d }, // ADST_DCT = 1 - { fdct8_1d, fadst8_1d }, // DCT_ADST = 2 - { fadst8_1d, fadst8_1d } // ADST_ADST = 3 + { fdct8, fdct8 }, // DCT_DCT = 0 + { fadst8, fdct8 }, // ADST_DCT = 1 + { fdct8, fadst8 }, // DCT_ADST = 2 + { fadst8, fadst8 } // ADST_ADST = 3 }; void vp9_short_fht8x8_c(int16_t *input, int16_t *output, @@ -654,7 +654,7 @@ void vp9_short_walsh8x4_c(int16_t *input, int16_t *output, int pitch) { // Rewrote to use same algorithm as others. -static void fdct16_1d(int16_t in[16], int16_t out[16]) { +static void fdct16(int16_t in[16], int16_t out[16]) { /*canbe16*/ int step1[8]; /*canbe16*/ int step2[8]; /*canbe16*/ int step3[8]; @@ -680,7 +680,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { step1[6] = in[1] - in[14]; step1[7] = in[0] - in[15]; - // fdct8_1d(step, step); + // fdct8(step, step); { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -696,7 +696,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { s6 = input[1] - input[6]; s7 = input[0] - input[7]; - // fdct4_1d(step, step); + // fdct4(step, step); x0 = s0 + s3; x1 = s1 + s2; x2 = s1 - s2; @@ -795,7 +795,7 @@ static void fdct16_1d(int16_t in[16], int16_t out[16]) { out[15] = dct_const_round_shift(temp2); } -void fadst16_1d(int16_t *input, int16_t *output) { +void fadst16(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -958,10 +958,10 @@ void fadst16_1d(int16_t *input, int16_t 
*output) { } static const transform_2d FHT_16[] = { - { fdct16_1d, fdct16_1d }, // DCT_DCT = 0 - { fadst16_1d, fdct16_1d }, // ADST_DCT = 1 - { fdct16_1d, fadst16_1d }, // DCT_ADST = 2 - { fadst16_1d, fadst16_1d } // ADST_ADST = 3 + { fdct16, fdct16 }, // DCT_DCT = 0 + { fadst16, fdct16 }, // ADST_DCT = 1 + { fdct16, fadst16 }, // DCT_ADST = 2 + { fadst16, fadst16 } // ADST_ADST = 3 }; void vp9_short_fht16x16_c(int16_t *input, int16_t *output, diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index b271f55b9..d706ec71b 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -314,6 +314,7 @@ typedef struct VP9_COMP { MACROBLOCK mb; VP9_COMMON common; VP9_CONFIG oxcf; + struct rdcost_block_args rdcost_stack; struct lookahead_ctx *lookahead; struct lookahead_entry *source; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index dce73ee71..5ba3ec8ad 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -535,25 +535,6 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, return cost; } -struct rdcost_block_args { - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int rate[256]; - int64_t dist[256]; - int64_t sse[256]; - int this_rate; - int64_t this_dist; - int64_t this_sse; - int64_t this_rd; - int64_t best_rd; - int skip; - const int16_t *scan, *nb; -}; - static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { const int ss_txfrm_size = tx_size << 1; struct rdcost_block_args* args = arg; @@ -565,17 +546,17 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { int shift = args->tx_size == TX_32X32 ? 
0 : 2; int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - args->dist[block] = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, - &this_sse) >> shift; - args->sse[block] = this_sse >> shift; + args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse) >> shift; + args->sse = this_sse >> shift; if (x->skip_encode && xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> (shift + 2); - args->dist[block] += (p >> 4); - args->sse[block] += p; + args->dist += (p >> 4); + args->sse += p; } } @@ -586,10 +567,9 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, int x_idx, y_idx; txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); - args->rate[block] = cost_coeffs(args->x, plane, block, - args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->scan, args->nb); + args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); } static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, @@ -610,17 +590,17 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); - rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]); - rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]); + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); if (plane == 0) x->zcoeff_blk[tx_size][block] = rd1 > rd2; - args->this_rate += args->rate[block]; - args->this_dist += args->dist[block]; - args->this_sse += args->sse[block]; + args->this_rate += args->rate; + 
args->this_dist += args->dist; + args->this_sse += args->sse; args->this_rd += rd; if (args->this_rd > args->best_rd) { @@ -662,7 +642,20 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size, } } +static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size, + const int num_4x4_w, const int num_4x4_h, + const int64_t ref_rdcost, + struct rdcost_block_args *arg) { + vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); + arg->x = x; + arg->tx_size = tx_size; + arg->bw = num_4x4_w; + arg->bh = num_4x4_h; + arg->best_rd = ref_rdcost; +} + static void txfm_rd_in_plane(MACROBLOCK *x, + struct rdcost_block_args *rd_stack, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, @@ -674,30 +667,29 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; const uint8_t *band_translate; // just for the get_scan_and_band call - struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, - num_4x4_w, num_4x4_h, - { 0 }, { 0 }, { 0 }, - 0, 0, 0, 0, ref_best_rd, 0 }; + init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h, + ref_best_rd, rd_stack); if (plane == 0) xd->this_mi->mbmi.tx_size = tx_size; - vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left, + vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left, pd->above_context, pd->left_context, num_4x4_w, num_4x4_h); - get_scan_and_band(xd, tx_size, pd->plane_type, 0, &args.scan, &args.nb, - &band_translate); + get_scan_and_band(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, + &rd_stack->nb, &band_translate); - foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); - if (args.skip) { + foreach_transformed_block_in_plane(xd, bsize, plane, + block_yrd_txfm, rd_stack); + if (rd_stack->skip) { *rate = INT_MAX; *distortion = INT64_MAX; *sse = INT64_MAX; *skippable = 0; } else { - *distortion = args.this_dist; - *rate = args.this_rate; - *sse = args.this_sse; + *distortion = rd_stack->this_dist; + *rate = 
rd_stack->this_rate; + *sse = rd_stack->this_sse; *skippable = vp9_is_skippable_in_plane(xd, bsize, plane); } } @@ -725,7 +717,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, } else { mbmi->tx_size = TX_4X4; } - txfm_rd_in_plane(x, rate, distortion, skip, + txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); cpi->tx_stepdown_count[0]++; @@ -909,8 +901,8 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, // Actually encode using the chosen mode if a model was used, but do not // update the r, d costs - txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], - ref_best_rd, 0, bs, mbmi->tx_size); + txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, + &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); if (max_tx_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && @@ -937,6 +929,7 @@ static void super_block_yrd(VP9_COMP *cpi, int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack; assert(bs == mbmi->sb_type); if (mbmi->ref_frame[0] > INTRA_FRAME) @@ -972,14 +965,16 @@ static void super_block_yrd(VP9_COMP *cpi, skip, sse, ref_best_rd, bs); } else { if (bs >= BLOCK_32X32) - txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], - &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32); + txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], + ref_best_rd, 0, bs, TX_32X32); if (bs >= BLOCK_16X16) - txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], - &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16); - txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], + ref_best_rd, 0, bs, TX_16X16); + txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], 
&s[TX_8X8], &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8); - txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); @@ -1289,7 +1284,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, +static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { @@ -1312,7 +1307,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, *skippable = 1; for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, + txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_txfm_size); if (pnrate == INT_MAX) goto term; @@ -1351,7 +1346,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; - super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, + super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1382,8 +1377,8 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, int64_t this_sse; x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; - super_block_uvrd(&cpi->common, x, rate_tokenonly, - distortion, skippable, &this_sse, bsize, INT64_MAX); + super_block_uvrd(cpi, x, rate_tokenonly, distortion, + skippable, &this_sse, bsize, INT64_MAX); *rate = *rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -3008,7 +3003,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); 
rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); - super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, bsize, ref_best_rd - rdcosty); if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; @@ -3638,10 +3633,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // values, which actually are bigger than this_rd itself. This can // cause negative best_filter_rd[] values, which is obviously silly. // Therefore, if filter_cache < ref, we do an adjusted calculation. - if (cpi->rd_filter_cache[i] >= ref) + if (cpi->rd_filter_cache[i] >= ref) { adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; - else // FIXME(rbultje) do this for comppred also - adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref; + } else { + // FIXME(rbultje) do this for comppsred also + // + // To prevent out-of-range computation in + // adj_rd = cpi->rd_filter_cache[i] * this_rd / ref + // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio. 
+ int tmp = cpi->rd_filter_cache[i] * 256 / ref; + adj_rd = (this_rd * tmp) >> 8; + } best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); } } @@ -4190,7 +4192,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); - super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable, + super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu); if (rate_uv == INT_MAX) continue; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e58debfd8..810fdf51f 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -267,11 +267,11 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, } if (cfg.g_pass == VPX_RC_FIRST_PASS) { - oxcf->allow_lag = 0; - oxcf->lag_in_frames = 0; + oxcf->allow_lag = 0; + oxcf->lag_in_frames = 0; } else { - oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; - oxcf->lag_in_frames = cfg.g_lag_in_frames; + oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; + oxcf->lag_in_frames = cfg.g_lag_in_frames; } // VBR only supported for now. 
@@ -283,7 +283,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, else if (cfg.rc_end_usage == VPX_Q) oxcf->end_usage = USAGE_CONSTANT_QUALITY; - oxcf->target_bandwidth = cfg.rc_target_bitrate; + oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; @@ -298,7 +298,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz; - oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct; @@ -314,23 +314,23 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->encode_breakout = vp8_cfg.static_thresh; oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; - oxcf->Sharpness = vp8_cfg.Sharpness; + oxcf->Sharpness = vp8_cfg.Sharpness; - oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; - oxcf->output_pkt_list = vp8_cfg.pkt_list; + oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; + oxcf->output_pkt_list = vp8_cfg.pkt_list; oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; - oxcf->arnr_strength = vp8_cfg.arnr_strength; - oxcf->arnr_type = vp8_cfg.arnr_type; + oxcf->arnr_strength = vp8_cfg.arnr_strength; + oxcf->arnr_type = vp8_cfg.arnr_type; oxcf->tuning = vp8_cfg.tuning; oxcf->tile_columns = vp8_cfg.tile_columns; - oxcf->tile_rows = vp8_cfg.tile_rows; + oxcf->tile_rows = vp8_cfg.tile_rows; oxcf->lossless = vp8_cfg.lossless; - oxcf->error_resilient_mode = cfg.g_error_resilient; + oxcf->error_resilient_mode = cfg.g_error_resilient; oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; oxcf->ss_number_layers = cfg.ss_number_layers; |