13 files changed, 153 insertions, 178 deletions
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 4a788edc0..3538c7bd9 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -20,23 +20,24 @@ extern "C" {
 
 #include "acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t */*dst*/,
+void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
              int stride, int /*tx_type*/) {
   vp9_short_fdct4x4_c(in, out, stride);
 }
-void idct4x4_add(int16_t */*in*/, int16_t *out, uint8_t *dst,
+void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                  int stride, int /*tx_type*/) {
   vp9_short_idct4x4_add_c(out, dst, stride >> 1);
 }
-void fht4x4(int16_t *in, int16_t *out, uint8_t */*dst*/,
+void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
             int stride, int tx_type) {
   vp9_short_fht4x4_c(in, out, stride >> 1, tx_type);
 }
-void iht4x4_add(int16_t */*in*/, int16_t *out, uint8_t *dst,
+void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                 int stride, int tx_type) {
   vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
 }
@@ -77,8 +78,8 @@ class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
 
 TEST_P(FwdTrans4x4Test, SignBiasCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int16_t test_input_block[16];
-  int16_t test_output_block[16];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
   const int pitch = 8;
   int count_sign_block[16][2];
   const int count_test_block = 1000000;
@@ -140,9 +141,10 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
   double total_error = 0;
   const int count_test_block = 1000000;
   for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[16];
-    int16_t test_temp_block[16];
-    uint8_t dst[16], src[16];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
 
     for (int j = 0; j < 16; ++j) {
       src[j] = rnd.Rand8();
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 03301a31b..eeae208f2 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -13,6 +13,7 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx_ports/mem.h"
 
 extern "C" {
 #include "vp9_rtcd.h"
@@ -25,14 +26,16 @@ void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
 using libvpx_test::ACMRandom;
 
 namespace {
-void fdct8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+             int stride, int /*tx_type*/) {
   vp9_short_fdct8x8_c(in, out, stride);
 }
-void idct8x8_add(int16_t *in, int16_t *out, uint8_t *dst,
-                 int stride, int tx_type) {
+void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
+                 int stride, int /*tx_type*/) {
   vp9_short_idct8x8_add_c(out, dst, stride >> 1);
 }
-void fht8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/,
+            int stride, int tx_type) {
   // TODO(jingning): need to refactor this to test both _c and _sse2 functions,
   // when we have all inverse dct functions done sse2.
 #if HAVE_SSE2
@@ -41,7 +44,7 @@ void fht8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
   vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
 #endif
 }
-void iht8x8_add(int16_t *in, int16_t *out, uint8_t *dst,
+void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
                 int stride, int tx_type) {
   vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type);
 }
@@ -79,8 +82,8 @@ class FwdTrans8x8Test : public ::testing::TestWithParam<int> {
 
 TEST_P(FwdTrans8x8Test, SignBiasCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int16_t test_input_block[64];
-  int16_t test_output_block[64];
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
   const int pitch = 16;
   int count_sign_block[64][2];
   const int count_test_block = 100000;
@@ -150,9 +153,10 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) {
   double total_error = 0;
   const int count_test_block = 100000;
   for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[64];
-    int16_t test_temp_block[64];
-    uint8_t dst[64], src[64];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
 
     for (int j = 0; j < 64; ++j) {
       src[j] = rnd.Rand8();
@@ -200,9 +204,10 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) {
   double total_error = 0;
   const int count_test_block = 100000;
   for (int i = 0; i < count_test_block; ++i) {
-    int16_t test_input_block[64];
-    int16_t test_temp_block[64];
-    uint8_t dst[64], src[64];
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
 
     for (int j = 0; j < 64; ++j) {
       src[j] = rnd.Rand8() % 2 ? 255 : 0;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index a09f33ed9..0f197e330 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -338,6 +338,7 @@ typedef struct macroblockd {
   signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
   /* 0 = Intra, Last, GF, ARF */
   signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
+
   /* 0 = ZERO_MV, MV */
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
   /* 0 = ZERO_MV, MV */
@@ -404,34 +405,15 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
   int bwl = b_width_log2(sb_type);
   int bhl = b_height_log2(sb_type);
   int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
-  int i;
+  char pcvalue[2] = {~(0xe << boffset), ~(0xf <<boffset)};
+
+  assert(MAX(bwl, bhl) <= bsl);
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
   // bits of smaller block sizes to be zero.
-  if ((bwl == bsl) && (bhl == bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xf << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xf << boffset);
-  } else if ((bwl == bsl) && (bhl < bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xe << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xf << boffset);
-  }  else if ((bwl < bsl) && (bhl == bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xf << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xe << boffset);
-  } else if ((bwl < bsl) && (bhl < bsl)) {
-    for (i = 0; i < bs; i++)
-      xd->left_seg_context[i] = ~(0xe << boffset);
-    for (i = 0; i < bs; i++)
-      xd->above_seg_context[i] = ~(0xe << boffset);
-  } else {
-    assert(0);
-  }
+  vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs);
+  vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
 }
 
 static INLINE int partition_plane_context(MACROBLOCKD *xd,
@@ -504,53 +486,25 @@ static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
   return subsize;
 }
 
-// transform mapping
-static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
-  switch (bmode) {
-    case TM_PRED :
-    case D135_PRED :
-      return ADST_ADST;
-
-    case V_PRED :
-    case D117_PRED :
-    case D63_PRED:
-      return ADST_DCT;
-
-    case H_PRED :
-    case D153_PRED :
-    case D27_PRED :
-      return DCT_ADST;
+extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
 
-    default:
-      return DCT_DCT;
-  }
-}
-
-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
+static INLINE TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
   MODE_INFO *const mi = xd->mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
     return DCT_DCT;
 
-  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
-    return txfm_map(mi->bmi[ib].as_mode.first);
-  } else {
-    assert(mbmi->mode <= TM_PRED);
-    return txfm_map(mbmi->mode);
-  }
+  return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ?
+                       mi->bmi[ib].as_mode.first : mbmi->mode];
 }
 
-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd) {
-  return xd->mode_info_context->mbmi.mode <= TM_PRED
-             ? txfm_map(xd->mode_info_context->mbmi.mode)
-             : DCT_DCT;
+static INLINE TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd) {
+  return mode2txfm_map[xd->mode_info_context->mbmi.mode];
 }
 
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd) {
-  return xd->mode_info_context->mbmi.mode <= TM_PRED
-             ? txfm_map(xd->mode_info_context->mbmi.mode)
-             : DCT_DCT;
+static INLINE TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd) {
+  return  mode2txfm_map[xd->mode_info_context->mbmi.mode];
 }
 
 void vp9_setup_block_dptrs(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 4086bf0e2..2989b9ccc 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -16,6 +16,24 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"
 
+const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
+    DCT_DCT,    // DC
+    ADST_DCT,   // V
+    DCT_ADST,   // H
+    DCT_DCT,    // D45
+    ADST_ADST,  // D135
+    ADST_DCT,   // D117
+    DCT_ADST,   // D153
+    DCT_ADST,   // D27
+    ADST_DCT,   // D63
+    ADST_ADST,  // TM
+    DCT_DCT,    // NEARESTMV
+    DCT_DCT,    // NEARMV
+    DCT_DCT,    // ZEROMV
+    DCT_DCT     // NEWMV
+};
+
+
 static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
                           int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
@@ -300,6 +318,7 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
                             int bwl_in,
                             TX_SIZE tx_size,
                             int mode,
+                            uint8_t *reference, int ref_stride,
                             uint8_t *predictor, int pre_stride) {
   const int bwl = bwl_in - tx_size;
   const int wmask = (1 << bwl) - 1;
@@ -309,7 +328,7 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
   const int txfm_block_size = 4 << tx_size;
 
   assert(bwl >= 0);
-  vp9_build_intra_predictors(predictor, pre_stride,
+  vp9_build_intra_predictors(reference, ref_stride,
                              predictor, pre_stride,
                              mode,
                              txfm_block_size,
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index f5f5f42c4..e369a7192 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -25,6 +25,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd,
                             int block_idx,
                             int bwl_in,
                             TX_SIZE tx_size,
-                            int mode,
+                            int mode, uint8_t *ref, int ref_stride,
                             uint8_t *predictor, int pre_stride);
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index a87cfd3c5..ac8404001 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -261,6 +261,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
   vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+                          dst, pd->dst.stride,
                           dst, pd->dst.stride);
 
   // Early exit if there are no coefficients
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index e800582dc..4b1ff103a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1531,8 +1531,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
                    0, 0, NULL, NULL );
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
-  vp9_build_block_offsets(x);
-
   vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   xd->mode_info_context->mbmi.mode = DC_PRED;
@@ -2006,9 +2004,6 @@ void vp9_encode_frame(VP9_COMP *cpi) {
 
 }
 
-void vp9_build_block_offsets(MACROBLOCK *x) {
-}
-
 static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index d37bdca36..399196927 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -15,8 +15,6 @@
 struct macroblock;
 struct yv12_buffer_config;
 
-void vp9_build_block_offsets(struct macroblock *x);
-
 void vp9_setup_src_planes(struct macroblock *x,
                           const struct yv12_buffer_config *src,
                           int mb_row, int mb_col);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index ccd84b39c..e13ffbdcd 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -78,7 +78,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
 
 
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
-#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
 
 struct vp9_token_state {
@@ -643,6 +642,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
   vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+                          dst, pd->dst.stride,
                           dst, pd->dst.stride);
   vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
                      src, p->src.stride, dst, pd->dst.stride);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 522f89982..d25d78178 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -521,8 +521,6 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   xd->mode_info_context = cm->mi;
 
-  vp9_build_block_offsets(x);
-
   vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   vp9_frame_init_quantizer(cpi);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1204ce092..22fd87d1b 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -216,6 +216,7 @@ typedef struct {
   int static_segmentation;
   int comp_inter_joint_search_thresh;
   int adpative_rd_thresh;
+  int skip_encode_sb;
   int use_lastframe_partitioning;
   int use_largest_txform;
   int use_8tap_always;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index dc3536387..833dfff57 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -495,23 +495,26 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 
 static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
                                int shift) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  struct macroblockd_plane *p = &x->e_mbd.plane[0];
+  const int bw = plane_block_width(bsize, p);
+  const int bh = plane_block_height(bsize, p);
   return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                         16 << (bwl + bhl)) >> shift;
+                         bw * bh) >> shift;
 }
 
 static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
                                 int shift) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   int64_t sum = 0;
   int plane;
 
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
-    const int subsampling = x->e_mbd.plane[plane].subsampling_x +
-                            x->e_mbd.plane[plane].subsampling_y;
+    struct macroblockd_plane *p = &x->e_mbd.plane[plane];
+    const int bw = plane_block_width(bsize, p);
+    const int bh = plane_block_height(bsize, p);
     sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
-                           16 << (bwl + bhl - subsampling));
+                           bw * bh);
   }
+
   return sum >> shift;
 }
 
@@ -645,7 +648,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   int rate = 0;
   int64_t distortion;
   VP9_COMMON *const cm = &cpi->common;
-  const int src_stride = x->plane[0].src.stride;
+  struct macroblock_plane *p = &x->plane[0];
+  struct macroblockd_plane *pd = &xd->plane[0];
+  const int src_stride = p->src.stride;
   uint8_t *src, *dst;
   int16_t *src_diff, *coeff;
 
@@ -679,18 +684,20 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         block = ib + idy * 2 + idx;
         xd->mode_info_context->bmi[block].as_mode.first = mode;
         src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        x->plane[0].src.buf, src_stride);
+                                        p->src.buf, src_stride);
         src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                             x->plane[0].src_diff);
+                                             p->src_diff);
         coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
         dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                        xd->plane[0].dst.buf,
-                                        xd->plane[0].dst.stride);
+                                        pd->dst.buf,
+                                        pd->dst.stride);
         vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8),
-                                TX_4X4, mode, dst, xd->plane[0].dst.stride);
+                                TX_4X4, mode,
+                                dst, pd->dst.stride,
+                                dst, pd->dst.stride);
         vp9_subtract_block(4, 4, src_diff, 8,
                            src, src_stride,
-                           dst, xd->plane[0].dst.stride);
+                           dst, pd->dst.stride);
 
         tx_type = get_tx_type_4x4(xd, block);
         if (tx_type != DCT_DCT) {
@@ -703,15 +710,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
         ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
                              tempa + idx, templ + idy, TX_4X4, 16);
-        distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                         block, 16), 16) >> 2;
+        distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
+                                                          block, 16), 16) >> 2;
 
         if (best_tx_type != DCT_DCT)
-          vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
-                               dst, xd->plane[0].dst.stride, best_tx_type);
+          vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+                               dst, pd->dst.stride, best_tx_type);
         else
-          xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
-                             dst, xd->plane[0].dst.stride);
+          xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
+                             dst, pd->dst.stride);
       }
     }
 
@@ -731,7 +738,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         for (idx = 0; idx < bw; ++idx) {
           block = ib + idy * 2 + idx;
           vpx_memcpy(best_dqcoeff[idy * 2 + idx],
-                     BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                     BLOCK_OFFSET(pd->dqcoeff, block, 16),
                      sizeof(best_dqcoeff[0]));
         }
       }
@@ -743,18 +750,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       block = ib + idy * 2 + idx;
       xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
       dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
-                                      xd->plane[0].dst.buf,
-                                      xd->plane[0].dst.stride);
+                                      pd->dst.buf,
+                                      pd->dst.stride);
 
       vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4,
-                              *best_mode, dst, xd->plane[0].dst.stride);
+                              *best_mode, dst, pd->dst.stride,
+                              dst, pd->dst.stride);
       // inverse transform
       if (best_tx_type != DCT_DCT)
         vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                             xd->plane[0].dst.stride, best_tx_type);
+                            pd->dst.stride, best_tx_type);
       else
         xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
-                           xd->plane[0].dst.stride);
+                           pd->dst.stride);
     }
   }
 
@@ -1093,25 +1101,22 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
   int k;
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
-  int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  const int bw = plane_block_width(bsize, &xd->plane[0]);
+  const int bh = plane_block_height(bsize, &xd->plane[0]);
   int idx, idy;
   const int src_stride = x->plane[0].src.stride;
-  uint8_t* const src =
-  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            x->plane[0].src.buf, src_stride);
-  int16_t* src_diff =
-  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            x->plane[0].src_diff);
+  uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                 x->plane[0].src.buf,
+                                                 src_stride);
+  int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                x->plane[0].src_diff);
   int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-  uint8_t* const pre =
-  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            xd->plane[0].pre[0].buf,
-                            xd->plane[0].pre[0].stride);
-  uint8_t* const dst =
-  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
-                            xd->plane[0].dst.buf,
-                            xd->plane[0].dst.stride);
+  uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                 xd->plane[0].pre[0].buf,
+                                                 xd->plane[0].pre[0].stride);
+  uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                                                 xd->plane[0].dst.buf,
+                                                 xd->plane[0].dst.stride);
   int64_t thisdistortion = 0;
   int thisrate = 0;
 
@@ -1124,7 +1129,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                             xd->plane[0].dst.stride,
                             &xd->mode_info_context->bmi[i].as_mv[0],
                             &xd->scale_factor[0],
-                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix,
+                            bw, bh, 0 /* no avg */, &xd->subpix,
                             MV_PRECISION_Q3);
 
   // TODO(debargha): Make this work properly with the
@@ -1138,17 +1143,17 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
     vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
                               dst, xd->plane[0].dst.stride,
                               &xd->mode_info_context->bmi[i].as_mv[1],
-                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->scale_factor[1], bw, bh, 1,
                               &xd->subpix, MV_PRECISION_Q3);
   }
 
-  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+  vp9_subtract_block(bh, bw, src_diff, 8,
                      src, src_stride,
                      dst, xd->plane[0].dst.stride);
 
   k = i;
-  for (idy = 0; idy < bh; ++idy) {
-    for (idx = 0; idx < bw; ++idx) {
+  for (idy = 0; idy < bh / 4; ++idy) {
+    for (idx = 0; idx < bw / 4; ++idx) {
       k += (idy * 2 + idx);
       src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
                                            x->plane[0].src_diff);
@@ -2231,13 +2236,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  int_mv *frame_mv,
                                  int mi_row, int mi_col,
                                  int_mv single_newmv[MAX_REF_FRAMES]) {
-  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  const enum BlockSize uv_block_size = get_plane_block_size(bsize,
-                                                            &xd->plane[1]);
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const int is_comp_pred = (mbmi->ref_frame[1] > 0);
   const int num_refs = is_comp_pred ? 2 : 1;
@@ -2368,13 +2368,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int p;
 
         for (p = 0; p < MAX_MB_PLANE; p++) {
-          const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
-          const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+          struct macroblockd_plane *pd = &xd->plane[p];
+          const int bw = plane_block_width(bsize, pd);
+          const int bh = plane_block_height(bsize, pd);
           int i;
 
-          for (i = 0; i < y; i++)
-            vpx_memcpy(&tmp_buf[p][64 * i],
-                       xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
+          for (i = 0; i < bh; i++)
+            vpx_memcpy(&tmp_buf[p][64 * i], pd->dst.buf + i * pd->dst.stride,
+                                   bw);
         }
         pred_exists = 1;
       }
@@ -2392,13 +2393,13 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int p;
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
-      const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
-      const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+      struct macroblockd_plane *pd = &xd->plane[p];
+      const int bw = plane_block_width(bsize, pd);
+      const int bh = plane_block_height(bsize, pd);
       int i;
 
-      for (i = 0; i < y; i++)
-        vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
-                   &tmp_buf[p][64 * i], x);
+      for (i = 0; i < bh; i++)
+        vpx_memcpy(pd->dst.buf + i * pd->dst.stride, &tmp_buf[p][64 * i], bw);
     }
   } else {
     // Handles the special case when a filter that is not in the
@@ -2412,36 +2413,37 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
     x->skip = 1;
   else if (x->encode_breakout) {
+    const enum BlockSize y_size = get_plane_block_size(bsize, &xd->plane[0]);
+    const enum BlockSize uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+
     unsigned int var, sse;
-    int threshold = (xd->plane[0].dequant[1]
-                     * xd->plane[0].dequant[1] >> 4);
+    int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4);
+
 
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                     x->plane[0].src.stride,
-                                     xd->plane[0].dst.buf,
-                                     xd->plane[0].dst.stride,
-                                     &sse);
+    var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                                 &sse);
 
     if ((int)sse < threshold) {
       unsigned int q2dc = xd->plane[0].dequant[0];
-      /* If there is no codeable 2nd order dc
-         or a very small uniform pixel change change */
+      // If there is no codeable 2nd order dc
+      // or a very small uniform pixel change
       if ((sse - var < q2dc * q2dc >> 4) ||
           (sse / 2 > var && sse - var < 64)) {
         // Check u and v to make sure skip is ok
         int sse2;
         unsigned int sse2u, sse2v;
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride, &sse2u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride, &sse2v);
+        var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+                                      x->plane[1].src.stride,
+                                      xd->plane[1].dst.buf,
+                                      xd->plane[1].dst.stride, &sse2u);
+        var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+                                      x->plane[2].src.stride,
+                                      xd->plane[2].dst.buf,
+                                      xd->plane[2].dst.stride, &sse2v);
         sse2 = sse2u + sse2v;
 
         if (sse2 * 2 < threshold) {
@@ -2449,7 +2451,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           *distortion = sse + sse2;
           *rate2 = 500;
 
-          /* for best_yrd calculation */
+          // for best_yrd calculation
           *rate_uv = 0;
           *distortion_uv = sse2;
 
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 484afce73..cc7d45243 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -375,7 +375,7 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
 }
 
 // load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) {
+static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
   in[0]  = _mm_load_si128((__m128i *)(input + 0 * stride));
   in[1]  = _mm_load_si128((__m128i *)(input + 1 * stride));
   in[2]  = _mm_load_si128((__m128i *)(input + 2 * stride));
@@ -396,7 +396,7 @@ static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) {
 }
 
 // write 8x8 array
-static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
+static INLINE void write_buffer_8x8(int16_t *output, __m128i *res) {
   __m128i sign0 = _mm_srai_epi16(res[0], 15);
   __m128i sign1 = _mm_srai_epi16(res[1], 15);
   __m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -435,7 +435,7 @@ static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
 }
 
 // perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i res[8]) {
+static INLINE void array_transpose_8x8(__m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
   const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
   const __m128i tr0_2 = _mm_unpackhi_epi16(res[0], res[1]);
@@ -486,7 +486,7 @@ static INLINE void array_transpose_8x8(__m128i res[8]) {
   // 07 17 27 37 47 57 67 77
 }
 
-void fdct8_1d_sse2(__m128i in[8]) {
+void fdct8_1d_sse2(__m128i *in) {
   // constants
   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -626,7 +626,7 @@ void fdct8_1d_sse2(__m128i in[8]) {
   array_transpose_8x8(in);
 }
 
-void fadst8_1d_sse2(__m128i in[8]) {
+void fadst8_1d_sse2(__m128i *in) {
   // Constants
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);