7 files changed, 249 insertions, 564 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index e8c823a59..d30cd4960 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -342,6 +342,7 @@ struct mb_plane {
   DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[256]);
+  DECLARE_ALIGNED(16, int16_t,  diff[64 * 64]);
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -355,7 +356,6 @@ struct mb_plane {
   BLOCK_OFFSET((x)->plane[2].field, ((i) - 20), 16))
 
 typedef struct macroblockd {
-  DECLARE_ALIGNED(16, int16_t,  diff[64*64+32*32*2]);      /* from idct diff */
 #if CONFIG_CODE_NONZEROCOUNT
   DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);
 #endif
@@ -878,31 +878,40 @@ typedef void (*foreach_predicted_block_visitor)(int plane, int block,
 static INLINE void foreach_predicted_block_in_plane(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
     foreach_predicted_block_visitor visit, void *arg) {
-  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+  int i, x, y;
+  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
 
   // block sizes in number of 4x4 blocks log 2 ("*_b")
   // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  const int block_size_b = bw + bh;
-
   // subsampled size of the block
-  const int ss_sum = xd->plane[plane].subsampling_x +
-                     xd->plane[plane].subsampling_y;
-  const int ss_block_size = block_size_b - ss_sum;
+  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bh = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
 
   // size of the predictor to use.
-  // TODO(jkoleszar): support I8X8, I4X4
-  const int pred_w = bw - xd->plane[plane].subsampling_x;
-  const int pred_h = bh - xd->plane[plane].subsampling_y;
-  const int pred_b = mode == SPLITMV ? 0 : pred_w + pred_h;
-  const int step = 1 << pred_b;
-
-  int i;
-
-  assert(pred_b <= block_size_b);
-  assert(pred_b == ss_block_size);
-  for (i = 0; i < (1 << ss_block_size); i += step) {
-    visit(plane, i, bsize, pred_w, pred_h, arg);
+  int pred_w, pred_h;
+  // NB: in the SPLITMV branch, ?: binds looser than >>: is_4x4 ? 0 : (1 >> ss)
+  if (mode == SPLITMV) {
+    // luma 4x4 (log2 = 0) or 8x8 (log2 = 1, less chroma subsampling)
+    const int is_4x4 =
+        (xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4);
+    pred_w = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_x;
+    pred_h = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_y;
+  } else {
+    pred_w = bw;
+    pred_h = bh;
+  }
+  assert(pred_w <= bw);
+  assert(pred_h <= bh);
+  // Visit each predictor-sized subblock in raster order. i is the index of
+  // the subblock's top-left 4x4 within the plane, with row pitch 1 << bw.
+  i = 0;
+  for (y = 0; y < 1 << bh; y += 1 << pred_h) {
+    for (x = 0; x < 1 << bw; x += 1 << pred_w) {
+      visit(plane, i, bsize, pred_w, pred_h, arg);
+      i += 1 << pred_w;
+    }
+    i -= 1 << bw;  // rewind to the start of this row
+    i += 1 << (bw + pred_h);  // then skip 1 << pred_h rows of 4x4s
   }
 }
 static INLINE void foreach_predicted_block(
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index 3b11fa9cb..0673fd81a 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -38,10 +38,10 @@ void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
 
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> bwl;
+    const int offset = x_idx * 32 + y_idx * 32 * stride;
 
     vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 1024),
-                        xd->diff + x_idx * 32 + y_idx * 32 * stride,
-                        stride * 2);
+                        xd->plane[0].diff + offset, stride * 2);
   }
 }
 
@@ -55,15 +55,14 @@ void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
     const int x_idx = n & (bw - 1), y_idx = n >> bwl;
     const TX_TYPE tx_type = get_tx_type_16x16(xd,
                                               (y_idx * bstride + x_idx) * 4);
+    const int offset = x_idx * 16 + y_idx * 16 * stride;
 
     if (tx_type == DCT_DCT) {
       vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256),
-                                    xd->diff + x_idx * 16 + y_idx * stride * 16,
-                                    stride * 2);
+                                    xd->plane[0].diff + offset, stride * 2);
     } else {
       vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256),
-                         xd->diff + x_idx * 16 + y_idx * stride * 16,
-                         stride, tx_type);
+                         xd->plane[0].diff + offset, stride, tx_type);
     }
   }
 }
@@ -77,15 +76,14 @@ void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> bwl;
     const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);
+    const int offset = x_idx * 8 + y_idx * 8 * stride;
 
     if (tx_type == DCT_DCT) {
       vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64),
-                                  xd->diff + x_idx * 8 + y_idx * stride * 8,
-                                  stride * 2);
+                                  xd->plane[0].diff + offset, stride * 2);
     } else {
       vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64),
-                       xd->diff + x_idx * 8 + y_idx * stride * 8,
-                       stride, tx_type);
+                       xd->plane[0].diff + offset, stride, tx_type);
     }
   }
 }
@@ -99,16 +97,15 @@ void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   for (n = 0; n < bw * bh; n++) {
     const int x_idx = n & (bw - 1), y_idx = n >> bwl;
     const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * bstride + x_idx);
+    const int offset = x_idx * 4 + y_idx * 4 * stride;
 
     if (tx_type == DCT_DCT) {
       vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[n],
                                   BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16),
-                                  xd->diff + x_idx * 4 + y_idx * 4 * stride,
-                                  stride * 2);
+                                  xd->plane[0].diff + offset, stride * 2);
     } else {
       vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16),
-                       xd->diff + x_idx * 4 + y_idx * 4 * stride,
-                       stride, tx_type);
+                       xd->plane[0].diff + offset, stride, tx_type);
     }
   }
 }
@@ -116,15 +113,12 @@ void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
 void vp9_inverse_transform_sbuv_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   assert(bsize == BLOCK_SIZE_SB64X64);
 
-  vp9_short_idct32x32(xd->plane[1].dqcoeff,
-                      xd->diff + 4096, 64);
-  vp9_short_idct32x32(xd->plane[2].dqcoeff,
-                      xd->diff + 4096 + 1024, 64);
+  vp9_short_idct32x32(xd->plane[1].dqcoeff, xd->plane[1].diff, 64);  // U
+  vp9_short_idct32x32(xd->plane[2].dqcoeff, xd->plane[2].diff, 64);  // V
 }
 
 void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
-  const int uoff = (16 * 16) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
   const int stride = 16 << (bwl - 1);
   int n;
@@ -134,15 +128,14 @@ void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
     const int off = x_idx * 16 + y_idx * stride * 16;
 
     vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 256),
-                                  xd->diff + uoff + off, stride * 2);
+                                  xd->plane[1].diff + off, stride * 2);
     vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 256),
-                                  xd->diff + voff + off, stride * 2);
+                                  xd->plane[2].diff + off, stride * 2);
   }
 }
 
 void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize) + 1, bhl = mb_height_log2(bsize) + 1;
-  const int uoff = (8 * 8) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
   const int stride = 8 << (bwl - 1);
   int n;
@@ -152,15 +145,14 @@ void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
     const int off = x_idx * 8 + y_idx * stride * 8;
 
     vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 64),
-                                xd->diff + uoff + off, stride * 2);
+                                xd->plane[1].diff + off, stride * 2);
     vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 64),
-                                xd->diff + voff + off, stride * 2);
+                                xd->plane[2].diff + off, stride * 2);
   }
 }
 
 void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize) + 2, bhl = mb_height_log2(bsize) + 2;
-  const int uoff = (4 * 4) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
   const int stride = 4 << (bwl - 1);
   int n;
@@ -171,9 +163,9 @@ void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
 
     vp9_inverse_transform_b_4x4(xd, xd->plane[1].eobs[n],
                                 BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 16),
-                                xd->diff + uoff + off, stride * 2);
+                                xd->plane[1].diff + off, stride * 2);
     vp9_inverse_transform_b_4x4(xd, xd->plane[2].eobs[n],
                                 BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 16),
-                                xd->diff + voff + off, stride * 2);
+                                xd->plane[2].diff + off, stride * 2);
   }
 }
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
index 6ed5f27d9..00fe9aa15 100644
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -77,23 +77,23 @@ void vp9_setup_block_dptrs(MACROBLOCKD *mb) {
     for (c = 0; c < 4; c++) {
       const int to = r * 4 + c;
       const int from = r * 4 * 16 + c * 4;
-      blockd[to].diff = &mb->diff[from];
+      blockd[to].diff = &mb->plane[0].diff[from];
     }
   }
 
   for (r = 0; r < 2; r++) {
     for (c = 0; c < 2; c++) {
       const int to = 16 + r * 2 + c;
-      const int from = 256 + r * 4 * 8 + c * 4;
-      blockd[to].diff = &mb->diff[from];
+      const int from = r * 4 * 8 + c * 4;
+      blockd[to].diff = &mb->plane[1].diff[from];
     }
   }
 
   for (r = 0; r < 2; r++) {
     for (c = 0; c < 2; c++) {
       const int to = 20 + r * 2 + c;
-      const int from = 320 + r * 4 * 8 + c * 4;
-      blockd[to].diff = &mb->diff[from];
+      const int from = r * 4 * 8 + c * 4;
+      blockd[to].diff = &mb->plane[2].diff[from];
     }
   }
 
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 121776c69..fae35844d 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -55,7 +55,7 @@ void vp9_recon_sby_s_c(MACROBLOCKD *mb, uint8_t *dst,
   const int bw = 16 << mb_width_log2(bsize), bh = 16 << mb_height_log2(bsize);
   int x, y;
   const int stride = mb->block[0].dst_stride;
-  const int16_t *diff = mb->diff;
+  const int16_t *diff = mb->plane[0].diff;
 
   for (y = 0; y < bh; y++) {
     for (x = 0; x < bw; x++)
@@ -69,12 +69,11 @@ void vp9_recon_sby_s_c(MACROBLOCKD *mb, uint8_t *dst,
 void vp9_recon_sbuv_s_c(MACROBLOCKD *mb, uint8_t *u_dst, uint8_t *v_dst,
                         BLOCK_SIZE_TYPE bsize) {
   const int bwl = mb_width_log2(bsize), bhl = mb_height_log2(bsize);
-  const int uoff = (16 * 16) << (bwl + bhl), voff = (uoff * 5) >> 2;
   const int bw = 8 << bwl, bh = 8 << bhl;
   int x, y;
   const int stride = mb->block[16].dst_stride;
-  const int16_t *u_diff = mb->diff + uoff;
-  const int16_t *v_diff = mb->diff + voff;
+  const int16_t *u_diff = mb->plane[1].diff;
+  const int16_t *v_diff = mb->plane[2].diff;
 
   for (y = 0; y < bh; y++) {
     for (x = 0; x < bw; x++) {
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 64929c1bc..549993200 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -358,9 +358,6 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
       w, h);
 }
 
-/* Like vp9_build_inter_predictor, but takes the full-pel part of the
- * mv separately, and the fractional part as a q4.
- */
 void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   const int_mv *mv_q4,
@@ -438,163 +435,143 @@ static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1,
   }
 }
 
-static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1,
-                                      struct scale_factors *s,
-                                      int block_size, int stride,
-                                      int which_mv, int weight,
-                                      const struct subpix_fn_table *subpix,
-                                      int row, int col) {
-  uint8_t *d0_predictor = *(d0->base_dst) + d0->dst;
-  uint8_t *d1_predictor = *(d1->base_dst) + d1->dst;
-  struct scale_factors * scale = &s[which_mv];
-  stride = d0->dst_stride;
-
-  assert(d1_predictor - d0_predictor == block_size);
-  assert(d1->pre == d0->pre + block_size);
-
-  scale->set_scaled_offsets(scale, row, col);
-
-  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
-    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
-
-    vp9_build_inter_predictor(*base_pre + d0->pre,
-                              d0->pre_stride,
-                              d0_predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              scale,
-                              2 * block_size, block_size,
-                              weight, subpix);
-  } else {
-    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
-    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
-
-    vp9_build_inter_predictor(*base_pre0 + d0->pre,
-                              d0->pre_stride,
-                              d0_predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              scale,
-                              block_size, block_size,
-                              weight, subpix);
-
-    scale->set_scaled_offsets(scale, row, col + block_size);
+#if !CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 
-    vp9_build_inter_predictor(*base_pre1 + d1->pre,
-                              d1->pre_stride,
-                              d1_predictor, stride,
-                              &d1->bmi.as_mv[which_mv],
-                              scale,
-                              block_size, block_size,
-                              weight, subpix);
-  }
+static INLINE int round_mv_comp_q4(int value) {
+  return (value < 0 ? value - 2 : value + 2) / 4;  // value/4, ties away from 0
 }
 
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  /* If the MV points so far into the UMV border that no visible pixels
-   * are used for reconstruction, the subpel part of the MV can be
-   * discarded and the MV limited to 16 pixels with equivalent results.
-   *
-   * This limit kicks in at 19 pixels for the top and left edges, for
-   * the 16 pixels plus 3 taps right of the central pixel when subpel
-   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
-   * left of the central pixel when filtering.
-   */
-  if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->col = xd->mb_to_left_edge - (16 << 3);
-  else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->col = xd->mb_to_right_edge + (16 << 3);
-
-  if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->row = xd->mb_to_top_edge - (16 << 3);
-  else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->row = xd->mb_to_bottom_edge + (16 << 3);
+static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int off, int idx) {
+  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row;
+  return round_mv_comp_q4(temp);  // rounded mean of the four rows
 }
 
-/* A version of the above function for chroma block MVs.*/
-static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  const int extend = VP9_INTERP_EXTEND;
-
-  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + extend) << 3))) ?
-            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
-  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + extend) << 3)) ?
-            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
-
-  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + extend) << 3))) ?
-            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
-  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + extend) << 3)) ?
-            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int off, int idx) {
+  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col;
+  return round_mv_comp_q4(temp);  // rounded mean of the four columns
 }
 
-#if !CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 // TODO(jkoleszar): yet another mv clamping function :-(
 MV clamp_mv_to_umv_border_sb(const MV *src_mv,
-    int bwl, int bhl,
+    int bwl, int bhl, int ss_x, int ss_y,
     int mb_to_left_edge, int mb_to_top_edge,
     int mb_to_right_edge, int mb_to_bottom_edge) {
   /* If the MV points so far into the UMV border that no visible pixels
    * are used for reconstruction, the subpel part of the MV can be
    * discarded and the MV limited to 16 pixels with equivalent results.
    */
-  const int epel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 3;
-  const int epel_right = epel_left - (1 << 3);
-  const int epel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 3;
-  const int epel_bottom = epel_top - (1 << 3);
+  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;  // 1/16 pel
+  const int spel_right = spel_left - (1 << 4);  // one pixel less
+  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;  // 1/16 pel
+  const int spel_bottom = spel_top - (1 << 4);  // one pixel less
   MV clamped_mv;
-  clamped_mv.col = clamp(src_mv->col,
-                         mb_to_left_edge - epel_left,
-                         mb_to_right_edge + epel_right);
-  clamped_mv.row = clamp(src_mv->row,
-                         mb_to_top_edge - epel_top,
-                         mb_to_bottom_edge + epel_bottom);
+  // Scale by multiplying: "<<" on a negative MV or edge is undefined in C.
+  assert(ss_x <= 1);
+  assert(ss_y <= 1);
+  clamped_mv.col = clamp(src_mv->col * (1 << (1 - ss_x)),
+                         mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+                         mb_to_right_edge * (1 << (1 - ss_x)) + spel_right);
+  clamped_mv.row = clamp(src_mv->row * (1 << (1 - ss_y)),
+                         mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+                         mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
   return clamped_mv;
 }
 
+// Copies bmi[0..15] from xd->mode_info_context into xd->block[].bmi.
+// TODO(jkoleszar): In principle, nothing has to depend on this, but it's
+// currently required: some users read mi->bmi, others xd->block[].bmi.
+static void duplicate_splitmv_bmi(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 2) {
+    xd->block[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
+    xd->block[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
+  }
+}
+
 struct build_inter_predictors_args {
   MACROBLOCKD *xd;
-  uint8_t* dst[MAX_MB_PLANE];
-  int dst_stride[MAX_MB_PLANE];
   int x;
   int y;
+  uint8_t* dst[MAX_MB_PLANE];         // output buffer, per plane
+  int dst_stride[MAX_MB_PLANE];       // row pitch of dst, per plane
+  uint8_t* pre[2][MAX_MB_PLANE];      // reference buffers: [which_mv][plane]
+  int pre_stride[2][MAX_MB_PLANE];    // row pitch of pre:  [which_mv][plane]
 };
 static void build_inter_predictors(int plane, int block,
                                    BLOCK_SIZE_TYPE bsize,
                                    int pred_w, int pred_h,
                                    void *argv) {
   const struct build_inter_predictors_args* const arg = argv;
-  const int bwl = pred_w, bw = 4 << bwl;
-  const int bhl = pred_h, bh = 4 << bhl;
+  MACROBLOCKD * const xd = arg->xd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bh = 4 << bhl,  bw = 4 << bwl;
   const int x_idx = block & ((1 << bwl) - 1), y_idx = block >> bwl;
   const int x = x_idx * 4, y = y_idx * 4;
-  MACROBLOCKD * const xd = arg->xd;
   const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
   int which_mv;
 
+  assert(x < bw);
+  assert(y < bh);
+  assert(xd->mode_info_context->mbmi.mode == SPLITMV || 4 << pred_w == bw);
+  assert(xd->mode_info_context->mbmi.mode == SPLITMV || 4 << pred_h == bh);
+  // One pass per reference: which_mv 1 runs only if second_ref_frame > 0.
   for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const MV* const mv = (xd->mode_info_context->mbmi.mode == SPLITMV)
-         ? &xd->block[block].bmi.as_mv[which_mv].as_mv
-         : &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
-
-    const uint8_t * const base_pre = which_mv ? xd->second_pre.y_buffer
-                                             : xd->pre.y_buffer;
-    const int pre_stride = which_mv ? xd->second_pre.y_stride
-                                    : xd->pre.y_stride;
+    // source: reference buffer for this MV/plane, advanced to (x, y)
+    const uint8_t * const base_pre = arg->pre[which_mv][plane];
+    const int pre_stride = arg->pre_stride[which_mv][plane];
     const uint8_t *const pre = base_pre +
         scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
     struct scale_factors * const scale =
       plane == 0 ? &xd->scale_factor[which_mv] : &xd->scale_factor_uv[which_mv];
 
+    // dest: pixel (x, y) of this plane's output buffer
+    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
+    // NOTE(review): predictor call below passes scale_factor, not `scale`.
+    // motion vector (split_chroma_mv is storage for the derived chroma MV)
+    const MV *mv;
+    MV split_chroma_mv;
     int_mv clamped_mv;
+    // SPLITMV: luma reads its own block MV; chroma averages four luma MVs.
+    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+      if (plane == 0) {
+        mv = &xd->block[block].bmi.as_mv[which_mv].as_mv;
+      } else {
+        const int y_block = (block & 2) * 4 + (block & 1) * 2;
+        split_chroma_mv.row = mi_mv_pred_row_q4(xd, y_block, which_mv);
+        split_chroma_mv.col = mi_mv_pred_col_q4(xd, y_block, which_mv);
+        mv = &split_chroma_mv;
+      }
+    } else {
+      mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
+    }
+
+    /* TODO(jkoleszar): This clamping is done in the incorrect place for the
+     * scaling case. It needs to be done on the scaled MV, not the pre-scaling
+     * MV. Note, however, that it performs the subsampling-aware scaling so
+     * that the result is always q4.
+     */
     clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl,
+                                                 xd->plane[plane].subsampling_x,
+                                                 xd->plane[plane].subsampling_y,
                                                  xd->mb_to_left_edge,
                                                  xd->mb_to_top_edge,
                                                  xd->mb_to_right_edge,
                                                  xd->mb_to_bottom_edge);
-
     scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
 
-    vp9_build_inter_predictor(pre, pre_stride,
-                              arg->dst[plane], arg->dst_stride[plane],
-                              &clamped_mv, &xd->scale_factor[which_mv],
-                              bw, bh, which_mv, &xd->subpix);
+    vp9_build_inter_predictor_q4(pre, pre_stride,
+                                 dst, arg->dst_stride[plane],
+                                 &clamped_mv, &xd->scale_factor[which_mv],
+                                 4 << pred_w, 4 << pred_h, which_mv,
+                                 &xd->subpix);
   }
 }
 void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
@@ -604,16 +581,85 @@ void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
                                     int mb_col,
                                     BLOCK_SIZE_TYPE bsize) {
   struct build_inter_predictors_args args = {
-    xd, {dst_y, NULL, NULL}, {dst_ystride, 0, 0}, mb_col * 16, mb_row * 16
+    xd, mb_col * 16, mb_row * 16,
+    {dst_y, NULL, NULL}, {dst_ystride, 0, 0},
+    {{xd->pre.y_buffer, NULL, NULL}, {xd->second_pre.y_buffer, NULL, NULL}},
+    {{xd->pre.y_stride, 0, 0}, {xd->second_pre.y_stride, 0, 0}},
   };
+
+  // TODO(jkoleszar): This is a hack no matter where you put it, but does it
+  // belong here?
+  if (xd->mode_info_context->mbmi.mode == SPLITMV)
+    duplicate_splitmv_bmi(xd);
+
   foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
 }
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
+                                     uint8_t *dst_u,
+                                     uint8_t *dst_v,
+                                     int dst_uvstride,
+                                     int mb_row,
+                                     int mb_col,
+                                     BLOCK_SIZE_TYPE bsize) {
+  struct build_inter_predictors_args args = {  // plane 0 slots left empty
+    xd, mb_col * 16, mb_row * 16,  // xd, x, y
+    {NULL, dst_u, dst_v}, {0, dst_uvstride, dst_uvstride},  // dst, dst_stride
+    {{NULL, xd->pre.u_buffer, xd->pre.v_buffer},  // pre[0]
+     {NULL, xd->second_pre.u_buffer, xd->second_pre.v_buffer}},  // pre[1]
+    {{0, xd->pre.uv_stride, xd->pre.uv_stride},  // pre_stride[0]
+     {0, xd->second_pre.uv_stride, xd->second_pre.uv_stride}},  // pre_stride[1]
+  };
+  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
+}
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
+                                   int mb_row, int mb_col,
+                                   BLOCK_SIZE_TYPE bsize) {
+  uint8_t *const y = xd->dst.y_buffer;
+  uint8_t *const u = xd->dst.u_buffer;
+  uint8_t *const v = xd->dst.v_buffer;
+  const int y_stride = xd->dst.y_stride;
+  const int uv_stride = xd->dst.uv_stride;
+  // Predict luma, then chroma, straight into the xd->dst buffers.
+  vp9_build_inter_predictors_sby(xd, y, y_stride, mb_row, mb_col, bsize);
+  vp9_build_inter_predictors_sbuv(xd, u, v, uv_stride, mb_row, mb_col, bsize);
+#if CONFIG_COMP_INTERINTRA_PRED
+  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+    if (bsize == BLOCK_SIZE_SB32X32)
+      vp9_build_interintra_32x32_predictors_sb(xd, y, u, v,
+                                               y_stride, uv_stride);
+    else  // NOTE(review): every non-32x32 bsize takes the 64x64 path
+      vp9_build_interintra_64x64_predictors_sb(xd, y, u, v,
+                                               y_stride, uv_stride);
+  }
 #endif
+}
+#endif  // !CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 
 #define AVERAGE_WEIGHT  (1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT))
 
 #if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+  /* If the MV points so far into the UMV border that no visible pixels
+   * are used for reconstruction, the subpel part of the MV can be
+   * discarded and the MV limited to 16 pixels with equivalent results.
+   *
+   * The limit kicks in at 16 + VP9_INTERP_EXTEND pixels beyond the top and
+   * left edges (19 if VP9_INTERP_EXTEND is 3: 16 pixels plus the filter taps
+   * right of the central pixel) and at 15 + VP9_INTERP_EXTEND pixels beyond
+   * the bottom and right edges; "<< 3" converts pixels to MV units.
+   */
+  if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
+    mv->col = xd->mb_to_left_edge - (16 << 3);
+  else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3))
+    mv->col = xd->mb_to_right_edge + (16 << 3);
+
+  if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
+    mv->row = xd->mb_to_top_edge - (16 << 3);
+  else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3))
+    mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
 // Whether to use implicit weighting for UV
 #define USE_IMPLICIT_WEIGHT_UV
 
@@ -950,9 +996,7 @@ static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd,
                               which_mv ? weight : 0, &xd->subpix);
   }
 }
-#endif
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd,
                                                uint8_t *dst_u,
                                                uint8_t *dst_v,
@@ -993,68 +1037,6 @@ static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd,
         scale, 8, 8, which_mv ? weight : 0, &xd->subpix);
   }
 }
-
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-#ifdef USE_IMPLICIT_WEIGHT_UV
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-  build_inter16x16_predictors_mbuv_w(xd, dst_u, dst_v, dst_uvstride,
-                                     weight, mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *uptr, *vptr;
-    int pre_stride = which_mv ? xd->second_pre.uv_stride
-                              : xd->pre.uv_stride;
-    int_mv mv;
-
-    struct scale_factors *scale = &xd->scale_factor_uv[which_mv];
-    mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&mv.as_mv, xd);
-
-    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
-    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
-
-    scale->set_scaled_offsets(scale, mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor_q4(
-        uptr, pre_stride, dst_u, dst_uvstride, &mv,
-        scale, 8, 8,
-        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);
-
-    vp9_build_inter_predictor_q4(
-        vptr, pre_stride, dst_v, dst_uvstride, &mv,
-        scale, 8, 8,
-        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);
-  }
-}
-#endif
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 static void build_inter_predictors_sby_w(MACROBLOCKD *x,
                                          uint8_t *dst_y,
                                          int dst_ystride,
@@ -1117,9 +1099,7 @@ void vp9_build_inter_predictors_sby(MACROBLOCKD *x,
   build_inter_predictors_sby_w(x, dst_y, dst_ystride, weight,
                                     mb_row, mb_col, bsize);
 }
-#endif
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 static void build_inter_predictors_sbuv_w(MACROBLOCKD *x,
                                           uint8_t *dst_u,
                                           uint8_t *dst_v,
@@ -1199,71 +1179,6 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
                                 weight, mb_row, mb_col, bsize);
 }
 
-#else
-
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *x,
-                                     uint8_t *dst_u,
-                                     uint8_t *dst_v,
-                                     int dst_uvstride,
-                                     int mb_row,
-                                     int mb_col,
-                                     BLOCK_SIZE_TYPE bsize) {
-  const int bwl = mb_width_log2(bsize),  bw = 1 << bwl;
-  const int bhl = mb_height_log2(bsize), bh = 1 << bhl;
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < bw * bh; n++) {
-    int scaled_uv_offset;
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    x->mb_to_top_edge    = edge[0] -           ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((bh - 1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -           ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((bw - 1 - x_idx) * 16) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                            y_idx * 8,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                              y_idx * 8,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    vp9_build_inter16x16_predictors_mbuv(x,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_uvstride, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-#endif
-
 void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
                                    int mb_row, int mb_col,
                                    BLOCK_SIZE_TYPE bsize) {
@@ -1286,79 +1201,10 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
   }
 #endif
 }
-
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd,
-                                         int mb_row, int mb_col) {
-  int i;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-  int which_mv = 0;
-  const int use_second_ref = mbmi->second_ref_frame > 0;
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && defined(USE_IMPLICIT_WEIGHT_SPLITMV)
-  int weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        if (mbmi->need_to_clamp_mvs) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-        }
-
-        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16, which_mv,
-                                  which_mv ? weight : 0,
-                                  &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16, which_mv,
-                                  which_mv ? weight : 0,
-                                  &xd->subpix,
-                                  mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-#if !defined(USE_IMPLICIT_WEIGHT_UV)
-  weight = AVERAGE_WEIGHT;
-#endif
-#endif
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-    const int x = 4 * (i & 1);
-    const int y = ((i - 16) >> 1) * 4;
-
-    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
-                                which_mv ? weight : 0, &xd->subpix,
-                                mb_row * 8 + y, mb_col * 8 + x);
-    }
-  }
-}
+#endif  // CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
 
 static INLINE int round_mv_comp(int value) {
-  return (value < 0 ? value - 4 : value + 4) / 8;
+  return (value < 0 ? value - 2 : value + 2) / 4;
 }
 
 static int mi_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
@@ -1377,128 +1223,20 @@ static int mi_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
   return round_mv_comp(temp);
 }
 
-static int b_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
-  BLOCKD *const blockd = mb->block;
-  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 1].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 4].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 5].bmi.as_mv[idx].as_mv.row;
-  return round_mv_comp(temp);
-}
-
-static int b_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
-  BLOCKD *const blockd = mb->block;
-  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 1].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 4].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 5].bmi.as_mv[idx].as_mv.col;
-  return round_mv_comp(temp);
-}
-
-
-static void build_4x4uvmvs(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      const int yoffset = i * 8 + j * 2;
-      const int uoffset = 16 + i * 2 + j;
-      const int voffset = 20 + i * 2 + j;
-
-      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
-      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
-      u->row = mi_mv_pred_row(xd, yoffset, 0);
-      u->col = mi_mv_pred_col(xd, yoffset, 0);
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(u, xd);
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(u, xd);
-
-      v->row = u->row;
-      v->col = u->col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
-        v = &blockd[voffset].bmi.as_mv[1].as_mv;
-        u->row = mi_mv_pred_row(xd, yoffset, 1);
-        u->col = mi_mv_pred_col(xd, yoffset, 1);
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(u, xd);
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(u, xd);
-
-        v->row = u->row;
-        v->col = u->col;
-      }
-    }
-  }
-}
-
 void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
                                    int mb_row,
                                    int mb_col) {
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
-  } else {
-    build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd, mb_row, mb_col);
-  }
+  vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
 }
 
+
 /*encoder only*/
 void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
                                         int mb_row, int mb_col) {
-  int i, j, weight;
-  BLOCKD *const blockd = xd->block;
-
-  /* build uv mvs */
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      const int yoffset = i * 8 + j * 2;
-      const int uoffset = 16 + i * 2 + j;
-      const int voffset = 20 + i * 2 + j;
-
-      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
-      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
+  uint8_t *const u = xd->dst.u_buffer;
+  uint8_t *const v = xd->dst.v_buffer;
+  const int uv_stride = xd->dst.uv_stride;
 
-      v->row = u->row = b_mv_pred_row(xd, yoffset, 0);
-      v->col = u->col = b_mv_pred_col(xd, yoffset, 0);
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
-        v = &blockd[voffset].bmi.as_mv[1].as_mv;
-
-        v->row = u->row = b_mv_pred_row(xd, yoffset, 1);
-        v->col = u->col = b_mv_pred_col(xd, yoffset, 1);
-      }
-    }
-  }
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && \
-  defined(USE_IMPLICIT_WEIGHT_SPLITMV) && \
-  defined(USE_IMPLICIT_WEIGHT_UV)
-  weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);
-#else
-  weight = AVERAGE_WEIGHT;
-#endif
-  for (i = 16; i < 24; i += 2) {
-    const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-    const int x = 4 * (i & 1);
-    const int y = ((i - 16) >> 1) * 4;
-
-    int which_mv;
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
-                                which_mv ? weight : 0,
-                                &xd->subpix, mb_row * 8 + y, mb_col * 8 + x);
-    }
-  }
+  vp9_build_inter_predictors_sbuv(xd, u, v, uv_stride, mb_row, mb_col,
+                                  BLOCK_SIZE_MB16X16);
 }
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 38981e9c1..ee34fc5d2 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -16,28 +16,20 @@
 
 struct subpix_fn_table;
 
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col);
-
-void vp9_build_inter_predictors_sby(MACROBLOCKD *x,
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
                                     uint8_t *dst_y,
                                     int dst_ystride,
                                     int mb_row,
                                     int mb_col,
                                     BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *x,
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
                                      uint8_t *dst_u,
                                      uint8_t *dst_v,
                                      int dst_uvstride,
                                      int mb_row,
                                      int mb_col,
                                      BLOCK_SIZE_TYPE bsize);
-
 void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
                                    int mb_row, int mb_col,
                                    BLOCK_SIZE_TYPE bsize);
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 310f8ed24..2b66834a7 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -278,43 +278,20 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
+  assert(w <= 64);
   assert(h <= 64);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  } else {
+    vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h);
 }
 
 void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
@@ -322,42 +299,20 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
+  assert(w <= 64);
   assert(h <= 64);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
-                                        dst, dst_stride,
-                                        h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+  } else {
+    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h);
 }
 #endif