7 files changed, 107 insertions, 69 deletions
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 67cfc9d71..632dae8fd 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -122,7 +122,7 @@ void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
   }
 }
 
-void idct4_1d(int16_t *input, int16_t *output) {
+static void idct4_1d(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -200,7 +200,7 @@ void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
   }
 }
 
-void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -320,10 +320,9 @@ static const transform_2d IHT_4[] = {
 
 void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
+  int i, j;
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
   int16_t temp_in[4], temp_out[4];
   const transform_2d ht = IHT_4[tx_type];
 
@@ -340,7 +339,7 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 4 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
 }
 
@@ -430,10 +429,9 @@ static const transform_2d IHT_8[] = {
 
 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
+  int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
   int16_t temp_in[8], temp_out[8];
   const transform_2d ht = IHT_8[tx_type];
 
@@ -450,7 +448,7 @@ void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
   }
 }
 
@@ -486,7 +484,7 @@ void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
   output[0] = ROUND_POWER_OF_TWO(out, 5);
 }
 
-void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
 
@@ -853,18 +851,17 @@ static const transform_2d IHT_16[] = {
 };
 
 void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, TX_TYPE tx_type) {
+                          int input_pitch, TX_TYPE tx_type) {
+  int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
   int16_t temp_in[16], temp_out[16];
   const transform_2d ht = IHT_16[tx_type];
 
   // Rows
   for (i = 0; i < 16; ++i) {
     ht.rows(input, outptr);
-    input += half_pitch;
+    input += input_pitch;
     outptr += 16;
   }
 
@@ -912,7 +909,7 @@ void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
 
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index c6b961894..d431ea24b 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -25,8 +25,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
   for (i = 0; i < 16; i++) {
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff,
-                       32, tx_type);
+      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
       vp9_inverse_transform_b_4x4(xd, i, 32);
     }
@@ -58,8 +57,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   for (i = 0; i < 9; i += 8) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff,
-                           32, tx_type);
+      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -69,7 +67,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
       vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
-                           32, tx_type);
+                           16, tx_type);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -101,7 +99,7 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
   BLOCKD *bd = &xd->block[0];
   TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
   if (tx_type != DCT_DCT) {
-    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 32, tx_type);
+    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);
   } else {
     vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                   &xd->block[0].diff[0], 32);
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 0e205bdda..1d6c66afd 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -65,7 +65,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
   for (i = 0; i < 16; i++)
     input[i] = dq[i] * input[i];
 
-  vp9_short_iht4x4(input, output, 8, tx_type);
+  vp9_short_iht4x4(input, output, 4, tx_type);
   vpx_memset(input, 0, 32);
 
   add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
@@ -86,7 +86,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
     for (i = 1; i < 64; i++)
       input[i] *= dq[1];
 
-    vp9_short_iht8x8(input, output, 16, tx_type);
+    vp9_short_iht8x8(input, output, 8, tx_type);
     vpx_memset(input, 0, 128);
 
     add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
@@ -247,7 +247,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
       input[i] *= dq[1];
 
     // inverse hybrid transform
-    vp9_short_iht16x16(input, output, 32, tx_type);
+    vp9_short_iht16x16(input, output, 16, tx_type);
 
     // the idct halves ( >> 1) the pitch
     // vp9_short_idct16x16_c(input, output, 32);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index e2f3e2677..9c2203dea 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -105,7 +105,6 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
   int16_t out[4 * 4];
   int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -137,7 +136,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
   // column transform
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 4;
+      temp_in[j] = input[j * pitch + i] << 4;
     if (i == 0 && temp_in[0])
       temp_in[0] += 1;
     fwdc(temp_in, temp_out);
@@ -308,7 +307,6 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
   int16_t out[64];
   int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
@@ -339,7 +337,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
   // column transform
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 2;
+      temp_in[j] = input[j * pitch + i] << 2;
     fwdc(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       outptr[j * 8 + i] = temp_out[j];
@@ -697,7 +695,6 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
                           int pitch, TX_TYPE tx_type) {
   int16_t out[256];
   int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -728,7 +725,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
   // column transform
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 2;
+      temp_in[j] = input[j * pitch + i] << 2;
     fwdc(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index ef64db1db..43bb4640c 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -54,9 +54,9 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
 
   tx_type = get_tx_type_4x4(&x->e_mbd, b);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
     vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);
+    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
   } else {
     x->fwd_txm4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(be, b) ;
@@ -149,10 +149,10 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
 
     tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 32, tx_type);
+      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
       vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
-                            32, tx_type);
+                            16, tx_type);
     } else {
       x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
@@ -164,9 +164,9 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
       be = &x->block[ib + iblock[i]];
       tx_type = get_tx_type_4x4(xd, b);
       if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
         vp9_ht_quantize_b_4x4(be, b, tx_type);
-        vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);
+        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
       } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
         x->fwd_txm8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(be, be + 1, b, b + 1);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 61516ddec..a753bf40f 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -174,7 +174,7 @@ void vp9_transform_mby_4x4(MACROBLOCK *x) {
     BLOCK *b = &x->block[i];
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(b->src_diff, b->coeff, 32, tx_type);
+      vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);
     } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) {
       x->fwd_txm8x4(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
@@ -209,7 +209,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
     BLOCK *b = &x->block[i];
     tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, b->coeff, 32, tx_type);
+      vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);
     } else {
       x->fwd_txm8x8(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
@@ -219,7 +219,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
     BLOCK *b = &x->block[i];
     tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 32, tx_type);
+      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);
     } else {
       x->fwd_txm8x8(&x->block[i].src_diff[0],
                            &x->block[i + 2].coeff[0], 32);
@@ -247,7 +247,7 @@ void vp9_transform_mby_16x16(MACROBLOCK *x) {
   TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
   vp9_clear_system_state();
   if (tx_type != DCT_DCT) {
-    vp9_short_fht16x16(b->src_diff, b->coeff, 32, tx_type);
+    vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);
   } else {
     x->fwd_txm16x16(&x->block[0].src_diff[0],
                            &x->block[0].coeff[0], 32);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d26b5ae7b..6e1122f3e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -402,6 +402,12 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
   ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+#if CONFIG_CNVCONTEXT
+  ENTROPY_CONTEXT *const a1 = a +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT *const l1 = l +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+#endif
 
   switch (tx_size) {
     case TX_4X4:
@@ -416,6 +422,10 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
       }
       break;
     case TX_8X8:
+#if CONFIG_CNVCONTEXT
+      a_ec = (a[0] + a[1]) != 0;
+      l_ec = (l[0] + l[1]) != 0;
+#endif
       scan = vp9_default_zig_zag1d_8x8;
       seg_eob = 64;
       break;
@@ -425,12 +435,27 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
       if (type == PLANE_TYPE_UV) {
         const int uv_idx = ib - 16;
         qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;
+#if CONFIG_CNVCONTEXT
+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
+#endif
       }
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       seg_eob = 1024;
       qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+#if CONFIG_CNVCONTEXT
+      a_ec = a[0] + a[1] + a[2] + a[3] +
+      a1[0] + a1[1] + a1[2] + a1[3];
+      l_ec = l[0] + l[1] + l[2] + l[3] +
+      l1[0] + l1[1] + l1[2] + l1[3];
+      a_ec = a_ec != 0;
+      l_ec = l_ec != 0;
+#endif
       break;
     default:
       abort();
@@ -459,6 +484,22 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   // is eob first coefficient;
   pt = (c > 0);
   *a = *l = pt;
+#if CONFIG_CNVCONTEXT
+  if (tx_size >= TX_8X8) {
+    a[1] = l[1] = pt;
+    if (tx_size >= TX_16X16) {
+      if (type == PLANE_TYPE_UV) {
+        a1[0] = a1[1] = l1[0] = l1[1] = pt;
+      } else {
+        a[2] = a[3] = l[2] = l[3] = pt;
+        if (tx_size >= TX_32X32) {
+          a1[0] = a1[1] = a1[2] = a1[3] = pt;
+          l1[0] = l1[1] = l1[2] = l1[3] = pt;
+        }
+      }
+    }
+  }
+#endif
   return cost;
 }
 
@@ -701,15 +742,15 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
 
 static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
   MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
   if (backup) {
     ta = (ENTROPY_CONTEXT *) &t_above,
     tl = (ENTROPY_CONTEXT *) &t_left;
 
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES) * 2);
   } else {
     ta = (ENTROPY_CONTEXT *) xd->above_context;
     tl = (ENTROPY_CONTEXT *) xd->left_context;
@@ -1013,7 +1054,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
     b->bmi.as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, b);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
       vp9_ht_quantize_b_4x4(be, b, tx_type);
     } else {
       x->fwd_txm4x4(be->src_diff, be->coeff, 32);
@@ -1046,7 +1087,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, b->diff, 32, best_tx_type);
+    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);
   else
     xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
 
@@ -1279,8 +1320,9 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   int distortion = 0, rate = 0;
   BLOCK  *be = x->block + ib;
   BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+  ENTROPY_CONTEXT_PLANES ta, tl;
+  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
+  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
 
   /*
    * The predictor buffer is a 2d buffer with a stride of 16.  Create
@@ -1309,7 +1351,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
       TX_TYPE tx_type = get_tx_type_8x8(xd, b);
       if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 32, tx_type);
+        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       else
         x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
@@ -1317,23 +1359,29 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       // compute quantization mse of 8x8 block
       distortion = vp9_block_error_c((x->block + idx)->coeff,
                                      (xd->block + idx)->dqcoeff, 64);
-      ta0 = a[vp9_block2above[TX_8X8][idx]];
-      tl0 = l[vp9_block2left[TX_8X8][idx]];
+
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
 
       rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                           &ta0, &tl0, TX_8X8);
+                           ta0, tl0, TX_8X8);
 
       rate += rate_t;
-      ta1 = ta0;
-      tl1 = tl0;
     } else {
       static const int iblock[4] = {0, 1, 4, 5};
       TX_TYPE tx_type;
       int i;
-      ta0 = a[vp9_block2above[TX_4X4][ib]];
-      ta1 = a[vp9_block2above[TX_4X4][ib + 1]];
-      tl0 = l[vp9_block2left[TX_4X4][ib]];
-      tl1 = l[vp9_block2left[TX_4X4][ib + 4]];
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
       distortion = 0;
       rate_t = 0;
       for (i = 0; i < 4; ++i) {
@@ -1342,7 +1390,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         be = &x->block[ib + iblock[i]];
         tx_type = get_tx_type_4x4(xd, b);
         if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
           vp9_ht_quantize_b_4x4(be, b, tx_type);
         } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
           x->fwd_txm8x4(be->src_diff, be->coeff, 32);
@@ -1354,15 +1402,13 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         }
         distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
         rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
-                              // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                              &ta0, &tl0,
+                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
         if (do_two) {
+          i++;
           rate_t += cost_coeffs(x, b + 1, PLANE_TYPE_Y_WITH_DC,
-                                // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                                &ta0, &tl0,
+                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                                 TX_4X4);
-          i++;
         }
       }
       b = &xd->block[ib];
@@ -1376,10 +1422,10 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       *bestrate = rate;
       *bestratey = rate_t;
       *bestdistortion = distortion;
-      besta0 = ta0;
-      besta1 = ta1;
-      bestl0 = tl0;
-      bestl1 = tl1;
+      besta0 = *ta0;
+      besta1 = *ta1;
+      bestl0 = *tl0;
+      bestl1 = *tl1;
       best_rd = this_rd;
       *best_mode = mode;
       copy_predictor_8x8(best_predictor, b->predictor);
@@ -1532,12 +1578,12 @@ static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
   if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
 
     ta = (ENTROPY_CONTEXT *) &t_above;
     tl = (ENTROPY_CONTEXT *) &t_left;