Diffstat (limited to 'vp9/common')
-rw-r--r--  vp9/common/vp9_blockd.h      |  52
-rw-r--r--  vp9/common/vp9_entropy.c     |   5
-rw-r--r--  vp9/common/vp9_entropymode.c |   8
-rw-r--r--  vp9/common/vp9_loopfilter.c  |  20
-rw-r--r--  vp9/common/vp9_loopfilter.h  |  23
-rw-r--r--  vp9/common/vp9_onyxc_int.h   |   2
-rw-r--r--  vp9/common/vp9_reconinter.c  | 249
-rw-r--r--  vp9/common/vp9_reconinter.h  |   4
-rw-r--r--  vp9/common/vp9_reconintra.c  |  48
-rw-r--r--  vp9/common/vp9_scale.c       | 135
-rw-r--r--  vp9/common/vp9_scale.h       |  40
11 files changed, 291 insertions, 295 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 60ef9c207..0f4d3aa19 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -19,9 +19,9 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_common_data.h"
-#include "vp9/common/vp9_convolve.h"
 #include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_mv.h"
+#include "vp9/common/vp9_scale.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_treecoder.h"
@@ -171,24 +171,6 @@ enum mv_precision {
   MV_PRECISION_Q4
 };
 
-#define VP9_REF_SCALE_SHIFT 14
-#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)
-
-struct scale_factors {
-  int x_scale_fp;  // horizontal fixed point scale factor
-  int y_scale_fp;  // vertical fixed point scale factor
-  int x_offset_q4;
-  int x_step_q4;
-  int y_offset_q4;
-  int y_step_q4;
-
-  int (*scale_value_x)(int val, const struct scale_factors *scale);
-  int (*scale_value_y)(int val, const struct scale_factors *scale);
-  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
-  MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale);
-
-  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
-};
 
 #if CONFIG_ALPHA
 enum { MAX_MB_PLANE = 4 };
@@ -217,27 +199,6 @@ struct macroblockd_plane {
 
 #define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
 
-#define MAX_REF_LF_DELTAS 4
-#define MAX_MODE_LF_DELTAS 2
-
-struct loopfilter {
-  int filter_level;
-
-  int sharpness_level;
-  int last_sharpness_level;
-
-  uint8_t mode_ref_delta_enabled;
-  uint8_t mode_ref_delta_update;
-
-  // 0 = Intra, Last, GF, ARF
-  signed char ref_deltas[MAX_REF_LF_DELTAS];
-  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
-
-  // 0 = ZERO_MV, MV
-  signed char mode_deltas[MAX_MODE_LF_DELTAS];
-  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
-};
-
 typedef struct macroblockd {
   struct macroblockd_plane plane[MAX_MB_PLANE];
 
@@ -252,7 +213,6 @@ typedef struct macroblockd {
   int right_available;
 
   struct segmentation seg;
-  struct loopfilter lf;
 
   // partition contexts
   PARTITION_CONTEXT *above_seg_context;
@@ -450,16 +410,6 @@ static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
   return 4 << (b_height_log2(bsize) - plane->subsampling_y);
 }
 
-static INLINE int plane_block_width_log2by4(
-    BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
-  return (b_width_log2(bsize) - plane->subsampling_x);
-}
-
-static INLINE int plane_block_height_log2by4(
-    BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) {
-  return (b_height_log2(bsize) - plane->subsampling_y);
-}
-
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   BLOCK_SIZE_TYPE bsize,
                                                   int ss_txfrm_size,
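Note: the surviving plane_block_width()/plane_block_height() helpers fold chroma
subsampling into the block size. A minimal sketch of the arithmetic, assuming
4:2:0 chroma (subsampling_x == 1) and that b_width_log2() counts 4-pixel units,
so b_width_log2(BLOCK_64X64) == 4 (both assumptions, not taken from this diff):

#include <assert.h>

/* Sketch only: mirrors 4 << (b_width_log2(bsize) - subsampling_x). */
static int sketch_plane_block_width(int b_width_log2, int subsampling_x) {
  return 4 << (b_width_log2 - subsampling_x);
}

int main(void) {
  assert(sketch_plane_block_width(4, 0) == 64);  /* luma plane of a 64x64 block */
  assert(sketch_plane_block_width(4, 1) == 32);  /* 4:2:0 chroma plane: 32x32 */
  return 0;
}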
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index df3a9fed5..21e0e0471 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -377,7 +377,7 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
 
 static void extend_model_to_full_distribution(vp9_prob p,
                                               vp9_prob *tree_probs) {
-  const int l = ((p - 1) / 2);
+  const int l = (p - 1) / 2;
   const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
   if (p & 1) {
     vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
@@ -622,7 +622,6 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
   int t, i, j, k, l;
   unsigned int branch_ct[UNCONSTRAINED_NODES][2];
   vp9_prob coef_probs[UNCONSTRAINED_NODES];
-  int entropy_nodes_adapt = UNCONSTRAINED_NODES;
 
   for (i = 0; i < BLOCK_TYPES; ++i)
     for (j = 0; j < REF_TYPES; ++j)
@@ -635,7 +634,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
                                              0);
           branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
           coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
-          for (t = 0; t < entropy_nodes_adapt; ++t)
+          for (t = 0; t < UNCONSTRAINED_NODES; ++t)
            dst_coef_probs[i][j][k][l][t] = merge_probs(
                pre_coef_probs[i][j][k][l][t], coef_probs[t],
                branch_ct[t], count_sat, update_factor);
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index c84b9e393..d28218199 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -440,14 +440,12 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
       fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
                                            branch_ct_8x8p[j]);
 
-    tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
-                                     branch_ct_16x16p);
+    tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
     for (j = 0; j < TX_SIZES - 2; ++j)
       fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
                                              branch_ct_16x16p[j]);
 
-    tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
-                                     branch_ct_32x32p);
+    tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
     for (j = 0; j < TX_SIZES - 1; ++j)
       fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
                                              branch_ct_32x32p[j]);
@@ -475,7 +473,7 @@ static void set_default_lf_deltas(struct loopfilter *lf) {
 void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
   // Reset the segment feature data to the default stats:
   // Features disabled, 0, with delta coding (Default state).
-  struct loopfilter *const lf = &xd->lf;
+  struct loopfilter *const lf = &cm->lf;
 
   int i;
   vp9_clearall_segfeatures(&xd->seg);
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 66df62753..9609a69ee 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -85,7 +85,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   // 2 when filter_lvl is between 32 and 63
   const int n_shift = default_filt_lvl >> 5;
   loop_filter_info_n *const lfi = &cm->lf_info;
-  struct loopfilter *const lf = &xd->lf;
+  struct loopfilter *const lf = &cm->lf;
   struct segmentation *const seg = &xd->seg;
 
   // update limits if sharpness has changed
@@ -108,7 +108,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
     if (!lf->mode_ref_delta_enabled) {
       // we could get rid of this if we assume that deltas are set to
       // zero when not in use; encoder always uses deltas
-      vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4);
+      vpx_memset(lfi->lvl[seg_id][0], lvl_seg, sizeof(lfi->lvl[seg_id][0]));
       continue;
     }
@@ -377,11 +377,23 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
 }
 
 void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
-                           int frame_filter_level, int y_only) {
+                           int frame_filter_level,
+                           int y_only, int partial) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
   if (!frame_filter_level) return;
+
+  start_mi_row = 0;
+  mi_rows_to_filter = cm->mi_rows;
+  if (partial && cm->mi_rows > 8) {
+    start_mi_row = cm->mi_rows >> 1;
+    start_mi_row &= 0xfffffff8;
+    mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+  }
+  end_mi_row = start_mi_row + mi_rows_to_filter;
   vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
   vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
-                       0, cm->mi_rows, y_only);
+                       start_mi_row, end_mi_row,
+                       y_only);
 }
 
 int vp9_loop_filter_worker(void *arg1, void *arg2) {
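Note: the new 'partial' mode filters only a band of rows around the middle of
the frame, starting at a superblock-aligned row and covering roughly an eighth
of the frame (at least 8 MI rows). A worked sketch of the row arithmetic,
assuming a 720p frame (90 MI rows of 8 pixels each — illustrative values):

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
  const int mi_rows = 90;           /* e.g. 720 luma rows / 8 per MI unit */
  int start = (mi_rows >> 1) & ~7;  /* 45 -> 40; same as &= 0xfffffff8 */
  int count = MAX(mi_rows / 8, 8);  /* 11 */
  printf("filter rows [%d, %d)\n", start, start + count);  /* [40, 51) */
  return 0;
}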
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 5fc909495..a3f240ef1 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -22,6 +22,27 @@
 
 #define SIMD_WIDTH 16
 
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 2
+
+struct loopfilter {
+  int filter_level;
+
+  int sharpness_level;
+  int last_sharpness_level;
+
+  uint8_t mode_ref_delta_enabled;
+  uint8_t mode_ref_delta_update;
+
+  // 0 = Intra, Last, GF, ARF
+  signed char ref_deltas[MAX_REF_LF_DELTAS];
+  signed char last_ref_deltas[MAX_REF_LF_DELTAS];
+
+  // 0 = ZERO_MV, MV
+  signed char mode_deltas[MAX_MODE_LF_DELTAS];
+  signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+};
+
 // Need to align this structure so when it is declared and
 // passed it can be loaded into vector registers.
 typedef struct {
@@ -51,7 +72,7 @@ void vp9_loop_filter_frame_init(struct VP9Common *const cm,
 void vp9_loop_filter_frame(struct VP9Common *cm,
                            struct macroblockd *mbd,
                            int filter_level,
-                           int y_only);
+                           int y_only, int partial);
 
 // Apply the loop filter to [start, stop) macro block rows in frame_buffer.
 void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 46b729d2c..4391a0a2c 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -176,6 +176,8 @@ typedef struct VP9Common {
 
   int ref_frame_sign_bias[MAX_REF_FRAMES];  /* Two state 0, 1 */
 
+  struct loopfilter lf;
+
   /* Y,U,V */
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
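Note: struct loopfilter now lives in VP9Common, matching its frame-level scope.
For readers new to the delta fields: when mode_ref_delta_enabled is set, the
base segment filter level is adjusted per reference frame and per mode before
clamping. A hedged sketch of that computation (the shift by n_shift and the
clamp to [0, 63] follow the frame-init code above; treat exact details as
illustrative, not a copy of vp9_loop_filter_frame_init):

#define MAX_LOOP_FILTER 63  /* assumed VP9 maximum filter level */

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* Sketch: effective level for one segment / reference / mode combination.
 * n_shift (0..2) scales the deltas with the base level. */
static int effective_lf_level(int lvl_seg, signed char ref_delta,
                              signed char mode_delta, int n_shift) {
  const int lvl = lvl_seg + (ref_delta << n_shift) + (mode_delta << n_shift);
  return clamp_int(lvl, 0, MAX_LOOP_FILTER);
}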
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 015ffdce0..cb746c7f3 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -10,140 +10,16 @@
 
 #include <assert.h>
 
+#include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
+
 #include "vpx/vpx_integer.h"
+
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "./vpx_scale_rtcd.h"
-
-static int scale_value_x_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
-}
-
-static int scale_value_y_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
-}
-
-static int unscaled_value(int val, const struct scale_factors *scale) {
-  (void) scale;
-  return val;
-}
-
-static MV32 mv_with_scaling(const MV *mv,
-                            const struct scale_factors *scale) {
-  const MV32 res = {
-    (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
-    (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
-  };
-  return res;
-}
-
-static MV32 mv_without_scaling(const MV *mv,
-                               const struct scale_factors *scale) {
-  const MV32 res = {
-    mv->row,
-    mv->col
-  };
-  return res;
-}
-
-static void set_offsets_with_scaling(struct scale_factors *scale,
-                                     int row, int col) {
-  const int x_q4 = 16 * col;
-  const int y_q4 = 16 * row;
-
-  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
-  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
-}
-
-static void set_offsets_without_scaling(struct scale_factors *scale,
-                                        int row, int col) {
-  scale->x_offset_q4 = 0;
-  scale->y_offset_q4 = 0;
-}
-
-static int get_fixed_point_scale_factor(int other_size, int this_size) {
-  // Calculate scaling factor once for each reference frame
-  // and use fixed point scaling factors in decoding and encoding routines.
-  // Hardware implementations can calculate scale factor in device driver
-  // and use multiplication and shifting on hardware instead of division.
-  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
-}
-
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h) {
-  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
-  scale->x_offset_q4 = 0;  // calculated per-mb
-  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
-
-  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
-  scale->y_offset_q4 = 0;  // calculated per-mb
-  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
-
-  if ((other_w == this_w) && (other_h == this_h)) {
-    scale->scale_value_x = unscaled_value;
-    scale->scale_value_y = unscaled_value;
-    scale->set_scaled_offsets = set_offsets_without_scaling;
-    scale->scale_mv = mv_without_scaling;
-  } else {
-    scale->scale_value_x = scale_value_x_with_scaling;
-    scale->scale_value_y = scale_value_y_with_scaling;
-    scale->set_scaled_offsets = set_offsets_with_scaling;
-    scale->scale_mv = mv_with_scaling;
-  }
-
-  // TODO(agrange): Investigate the best choice of functions to use here
-  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
-  // to do at full-pel offsets. The current selection, where the filter is
-  // applied in one direction only, and not at all for 0,0, seems to give the
-  // best quality, but it may be worth trying an additional mode that does
-  // do the filtering on full-pel.
-  if (scale->x_step_q4 == 16) {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in either direction.
-      scale->predict[0][0][0] = vp9_convolve_copy;
-      scale->predict[0][0][1] = vp9_convolve_avg;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      scale->predict[0][0][0] = vp9_convolve8_vert;
-      scale->predict[0][0][1] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_avg;
-    }
-  } else {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      scale->predict[0][0][0] = vp9_convolve8_horiz;
-      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_avg;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
-    } else {
-      // Must always scale in both directions.
-      scale->predict[0][0][0] = vp9_convolve8;
-      scale->predict[0][0][1] = vp9_convolve8_avg;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_avg;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_avg;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  scale->predict[1][1][0] = vp9_convolve8;
-  scale->predict[1][1][1] = vp9_convolve8_avg;
-}
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
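Note: the block removed above (moved verbatim to the new vp9_scale.c below)
builds a 2x2x2 function table indexed by whether the horizontal/vertical
subpel component is nonzero and whether the prediction is averaged. A hedged
sketch of how such a table is consulted at predict time; the simplified
convolve_fn_t signature and the index expressions are illustrative (the real
lookup lives in vp9_build_inter_predictor):

/* Hypothetical dispatch: pick a convolve function from the table by
 * testing the subpel parts of a Q4 motion vector. */
typedef void (*convolve_fn_t)(void);  /* simplified for the sketch */

struct predict_table {
  convolve_fn_t predict[2][2][2];  /* [horiz subpel][vert subpel][avg] */
};

static convolve_fn_t choose_predictor(const struct predict_table *t,
                                      int subpel_x_q4, int subpel_y_q4,
                                      int do_avg) {
  return t->predict[subpel_x_q4 != 0][subpel_y_q4 != 0][do_avg];
}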
@@ -211,20 +87,16 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
   return res;
 }
 
-
-
 // TODO(jkoleszar): yet another mv clamping function :-(
-MV clamp_mv_to_umv_border_sb(const MV *src_mv,
-                             int bwl, int bhl, int ss_x, int ss_y,
-                             int mb_to_left_edge, int mb_to_top_edge,
-                             int mb_to_right_edge, int mb_to_bottom_edge) {
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y) {
   // If the MV points so far into the UMV border that no visible pixels
   // are used for reconstruction, the subpel part of the MV can be
   // discarded and the MV limited to 16 pixels with equivalent results.
-  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
-  const int spel_right = spel_left - (1 << 4);
-  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
-  const int spel_bottom = spel_top - (1 << 4);
+  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
+  const int spel_right = spel_left - SUBPEL_SHIFTS;
+  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
+  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
   MV clamped_mv = {
     src_mv->row << (1 - ss_y),
     src_mv->col << (1 - ss_x)
@@ -232,10 +104,10 @@ MV clamp_mv_to_umv_border_sb(const MV *src_mv,
   };
 
   assert(ss_x <= 1);
   assert(ss_y <= 1);
 
-  clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
-                        (mb_to_right_edge << (1 - ss_x)) + spel_right,
-                        (mb_to_top_edge << (1 - ss_y)) - spel_top,
-                        (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+  clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
+                        (xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
+                        (xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
+                        (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
 
   return clamped_mv;
 }
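Note: the rewrite passes pixel dimensions (bw, bh) instead of log2 values and
reads the frame edges from xd directly; the literal (1 << 4) becomes the named
SUBPEL_SHIFTS and the shift count becomes SUBPEL_BITS. A worked sketch of the
border margins, assuming VP9_INTERP_EXTEND == 4 (an assumption about the
header constant) and a 16x16 luma block:

/* Sketch: subpel clamp margins for a 16x16 block, in 1/16-pel (Q4) units. */
#define VP9_INTERP_EXTEND 4          /* assumed value */
#define SUBPEL_BITS 4
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)

int main(void) {
  const int bw = 16, bh = 16;
  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;  /* 320 */
  const int spel_right = spel_left - SUBPEL_SHIFTS;               /* 304 */
  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;   /* 320 */
  const int spel_bottom = spel_top - SUBPEL_SHIFTS;               /* 304 */
  return (spel_left == 320 && spel_right == 304 &&
          spel_top == 320 && spel_bottom == 304) ? 0 : 1;
}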
@@ -244,39 +116,39 @@ struct build_inter_predictors_args {
   MACROBLOCKD *xd;
   int x;
   int y;
-  uint8_t* dst[MAX_MB_PLANE];
-  int dst_stride[MAX_MB_PLANE];
-  uint8_t* pre[2][MAX_MB_PLANE];
-  int pre_stride[2][MAX_MB_PLANE];
+  struct buf_2d *dst[MAX_MB_PLANE];
+  struct buf_2d *pre[2][MAX_MB_PLANE];
 };
 
 static void build_inter_predictors(int plane, int block, BLOCK_SIZE_TYPE bsize,
                                    int pred_w, int pred_h,
                                    void *argv) {
   const struct build_inter_predictors_args* const arg = argv;
-  MACROBLOCKD * const xd = arg->xd;
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
+  MACROBLOCKD *const xd = arg->xd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+  const int bw = 4 << bwl;
+  const int bh = plane_block_height(bsize, pd);
+  const int x = 4 * (block & ((1 << bwl) - 1));
+  const int y = 4 * (block >> bwl);
   const MODE_INFO *const mi = xd->mode_info_context;
   const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
   int which_mv;
 
-  assert(x < (4 << bwl));
-  assert(y < (4 << bhl));
-  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == (4 << bwl));
-  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == (4 << bhl));
+  assert(x < bw);
+  assert(y < bh);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
 
   for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    // source
-    const uint8_t * const base_pre = arg->pre[which_mv][plane];
-    const int pre_stride = arg->pre_stride[which_mv][plane];
-    const uint8_t *const pre = base_pre +
-        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
-    struct scale_factors * const scale = &xd->scale_factor[which_mv];
+    struct scale_factors *const scale = &xd->scale_factor[which_mv];
+    struct buf_2d *const pre_buf = arg->pre[which_mv][plane];
+    struct buf_2d *const dst_buf = arg->dst[plane];
 
-    // dest
-    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
+    const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
+        pre_buf->stride, scale);
+
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
 
     // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
     // same MV (the average of the 4 luma MVs) but we could do something
@@ -291,61 +163,40 @@ static void build_inter_predictors(int plane, int block,
     // scaling case. It needs to be done on the scaled MV, not the pre-scaling
     // MV. Note however that it performs the subsampling aware scaling so
    // that the result is always q4.
-    const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl,
-                                                xd->plane[plane].subsampling_x,
-                                                xd->plane[plane].subsampling_y,
-                                                xd->mb_to_left_edge,
-                                                xd->mb_to_top_edge,
-                                                xd->mb_to_right_edge,
-                                                xd->mb_to_bottom_edge);
+    const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                                pd->subsampling_x,
+                                                pd->subsampling_y);
+
     scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
 
-    vp9_build_inter_predictor(pre, pre_stride,
-                              dst, arg->dst_stride[plane],
-                              &res_mv, &xd->scale_factor[which_mv],
+    vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                              &res_mv, scale,
                               4 << pred_w, 4 << pred_h,
                               which_mv, &xd->subpix, MV_PRECISION_Q4);
   }
 }
 
-void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
-                                    int mi_row,
-                                    int mi_col,
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE_TYPE bsize) {
   struct build_inter_predictors_args args = {
     xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
-    {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
-    {{xd->plane[0].pre[0].buf, NULL, NULL},
-     {xd->plane[0].pre[1].buf, NULL, NULL}},
-    {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
+    {&xd->plane[0].dst, NULL, NULL},
+    {{&xd->plane[0].pre[0], NULL, NULL},
+     {&xd->plane[0].pre[1], NULL, NULL}},
   };
   foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors,
                                    &args);
 }
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     int mi_row,
-                                     int mi_col,
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE_TYPE bsize) {
   struct build_inter_predictors_args args = {
     xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
 #if CONFIG_ALPHA
-    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-     xd->plane[3].dst.buf},
-    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
-     xd->plane[3].dst.stride},
-    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
-      xd->plane[3].pre[0].buf},
-     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
-      xd->plane[3].pre[1].buf}},
-    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
-      xd->plane[3].pre[0].stride},
-     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
-      xd->plane[3].pre[1].stride}},
+    {NULL, &xd->plane[1].dst, &xd->plane[2].dst, &xd->plane[3].dst},
+    {{NULL, &xd->plane[1].pre[0], &xd->plane[2].pre[0], &xd->plane[3].pre[0]},
+     {NULL, &xd->plane[1].pre[1], &xd->plane[2].pre[1], &xd->plane[3].pre[1]}},
 #else
-    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
-    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
-    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
-     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
-    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
-     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
+    {NULL, &xd->plane[1].dst, &xd->plane[2].dst},
+    {{NULL, &xd->plane[1].pre[0], &xd->plane[2].pre[0]},
+     {NULL, &xd->plane[1].pre[1], &xd->plane[2].pre[1]}},
 #endif
   };
   foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
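Note: the argument struct now carries buf_2d pointers (buffer plus stride as
one unit) instead of parallel buf/stride arrays, which removes the duplicated
stride bookkeeping seen in the old initializers. A minimal sketch of the idea;
the struct shown is a simplification of libvpx's actual buf_2d:

#include <stdint.h>

/* Simplified: a pixel pointer paired with its stride. */
struct buf_2d {
  uint8_t *buf;
  int stride;
};

/* Addressing a pixel becomes one expression instead of two parallel lookups. */
static uint8_t *pixel_at(const struct buf_2d *b, int x, int y) {
  return b->buf + y * b->stride + x;
}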
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 6ec7323e1..82c0796dc 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -33,10 +33,6 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
                               VP9_COMMON *cm);
 
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h);
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index f351224a7..6209e1815 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -51,18 +51,17 @@ static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
                                  uint8_t *above_row, uint8_t *left_col) {
   int r, c;
   // first column
-  for (r = 0; r < bs - 1; ++r) {
-    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
-                                              left_col[r + 1], 1);
-  }
+  for (r = 0; r < bs - 1; ++r)
+    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] + left_col[r + 1], 1);
+
   pred_ptr[(bs - 1) * stride] = left_col[bs - 1];
   pred_ptr++;
 
   // second column
-  for (r = 0; r < bs - 2; ++r) {
-    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
-                                              left_col[r + 1] * 2 +
-                                              left_col[r + 2], 2);
-  }
+  for (r = 0; r < bs - 2; ++r)
+    pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] +
+                                              left_col[r + 1] * 2 +
+                                              left_col[r + 2], 2);
+
   pred_ptr[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left_col[bs - 2] +
                                                    left_col[bs - 1] * 3, 2);
@@ -70,15 +69,12 @@ static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
   pred_ptr++;
 
   // rest of last row
-  for (c = 0; c < bs - 2; ++c) {
+  for (c = 0; c < bs - 2; ++c)
     pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1];
-  }
 
-  for (r = bs - 2; r >= 0; --r) {
-    for (c = 0; c < bs - 2; ++c) {
+  for (r = bs - 2; r >= 0; --r)
+    for (c = 0; c < bs - 2; ++c)
       pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2];
-    }
-  }
 }
 intra_pred_allsizes(d27)
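Note: the directional predictors lean on ROUND_POWER_OF_TWO for their 2-tap
and 3-tap averaging filters. A small illustration, assuming the macro's
standard libvpx definition (add half the divisor, then shift — rounding
half up):

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  assert(ROUND_POWER_OF_TWO(10 + 11, 1) == 11);           /* 2-tap avg: 10.5 -> 11 */
  assert(ROUND_POWER_OF_TWO(10 + 2 * 11 + 12, 2) == 11);  /* 3-tap (1,2,1)/4 filter */
  return 0;
}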
@@ -86,16 +82,12 @@ static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
                                  uint8_t *above_row, uint8_t *left_col) {
   int r, c;
   for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      if (r & 1) {
-        pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] +
-                                         above_row[r/2 + c + 1] * 2 +
-                                         above_row[r/2 + c + 2], 2);
-      } else {
-        pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] +
-                                         above_row[r/2+ c + 1], 1);
-      }
-    }
+    for (c = 0; c < bs; ++c)
+      pred_ptr[c] = r & 1 ? ROUND_POWER_OF_TWO(above_row[r/2 + c] +
+                                               above_row[r/2 + c + 1] * 2 +
+                                               above_row[r/2 + c + 2], 2)
+                          : ROUND_POWER_OF_TWO(above_row[r/2 + c] +
+                                               above_row[r/2+ c + 1], 1);
     pred_ptr += stride;
   }
 }
@@ -141,9 +133,9 @@ static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs,
                                               left_col[0] * 2 +
                                               left_col[1], 2);
   for (r = 3; r < bs; ++r)
-    pred_ptr[(r-2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] +
-                                                  left_col[r - 2] * 2 +
-                                                  left_col[r - 1], 2);
+    pred_ptr[(r - 2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] +
+                                                    left_col[r - 2] * 2 +
+                                                    left_col[r - 1], 2);
   // the rest of the block
   for (r = 2; r < bs; ++r) {
     for (c = 1; c < bs; c++)
diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c
new file mode 100644
index 000000000..80137e547
--- /dev/null
+++ b/vp9/common/vp9_scale.c
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_scale.h"
+
+static int scaled_x(int val, const struct scale_factors *scale) {
+  return val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT;
+}
+
+static int scaled_y(int val, const struct scale_factors *scale) {
+  return val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT;
+}
+
+static int unscaled_value(int val, const struct scale_factors *scale) {
+  (void) scale;
+  return val;
+}
+
+static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) {
+  const MV32 res = {
+    (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
+    (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
+  };
+  return res;
+}
+
+static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) {
+  const MV32 res = {
+    mv->row,
+    mv->col
+  };
+  return res;
+}
+
+static void set_offsets_with_scaling(struct scale_factors *scale,
+                                     int row, int col) {
+  const int x_q4 = 16 * col;
+  const int y_q4 = 16 * row;
+
+  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xF;
+  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xF;
+}
+
+static void set_offsets_without_scaling(struct scale_factors *scale,
+                                        int row, int col) {
+  scale->x_offset_q4 = 0;
+  scale->y_offset_q4 = 0;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  // Calculate scaling factor once for each reference frame
+  // and use fixed point scaling factors in decoding and encoding routines.
+  // Hardware implementations can calculate scale factor in device driver
+  // and use multiplication and shifting on hardware instead of division.
+  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
+}
+
+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h) {
+  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+  scale->x_offset_q4 = 0;  // calculated per-mb
+  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
+
+  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+  scale->y_offset_q4 = 0;  // calculated per-mb
+  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
+
+  if (other_w == this_w && other_h == this_h) {
+    scale->scale_value_x = unscaled_value;
+    scale->scale_value_y = unscaled_value;
+    scale->set_scaled_offsets = set_offsets_without_scaling;
+    scale->scale_mv = unscaled_mv;
+  } else {
+    scale->scale_value_x = scaled_x;
+    scale->scale_value_y = scaled_y;
+    scale->set_scaled_offsets = set_offsets_with_scaling;
+    scale->scale_mv = scaled_mv;
+  }
+
+  // TODO(agrange): Investigate the best choice of functions to use here
+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
+  // to do at full-pel offsets. The current selection, where the filter is
+  // applied in one direction only, and not at all for 0,0, seems to give the
+  // best quality, but it may be worth trying an additional mode that does
+  // do the filtering on full-pel.
+  if (scale->x_step_q4 == 16) {
+    if (scale->y_step_q4 == 16) {
+      // No scaling in either direction.
+      scale->predict[0][0][0] = vp9_convolve_copy;
+      scale->predict[0][0][1] = vp9_convolve_avg;
+      scale->predict[0][1][0] = vp9_convolve8_vert;
+      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
+      scale->predict[1][0][0] = vp9_convolve8_horiz;
+      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+    } else {
+      // No scaling in x direction. Must always scale in the y direction.
+      scale->predict[0][0][0] = vp9_convolve8_vert;
+      scale->predict[0][0][1] = vp9_convolve8_avg_vert;
+      scale->predict[0][1][0] = vp9_convolve8_vert;
+      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
+      scale->predict[1][0][0] = vp9_convolve8;
+      scale->predict[1][0][1] = vp9_convolve8_avg;
+    }
+  } else {
+    if (scale->y_step_q4 == 16) {
+      // No scaling in the y direction. Must always scale in the x direction.
+      scale->predict[0][0][0] = vp9_convolve8_horiz;
+      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
+      scale->predict[0][1][0] = vp9_convolve8;
+      scale->predict[0][1][1] = vp9_convolve8_avg;
+      scale->predict[1][0][0] = vp9_convolve8_horiz;
+      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+    } else {
+      // Must always scale in both directions.
+      scale->predict[0][0][0] = vp9_convolve8;
+      scale->predict[0][0][1] = vp9_convolve8_avg;
+      scale->predict[0][1][0] = vp9_convolve8;
+      scale->predict[0][1][1] = vp9_convolve8_avg;
+      scale->predict[1][0][0] = vp9_convolve8;
+      scale->predict[1][0][1] = vp9_convolve8_avg;
+    }
+  }
+  // 2D subpel motion always gets filtered in both directions
+  scale->predict[1][1][0] = vp9_convolve8;
+  scale->predict[1][1][1] = vp9_convolve8_avg;
+}
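Note: get_fixed_point_scale_factor() expresses the reference-to-current size
ratio in Q14 (VP9_REF_SCALE_SHIFT == 14), so later scaling is a multiply and a
shift rather than a per-pixel division. A worked example, assuming a
1920-wide reference scaled to a 1280-wide current frame (illustrative sizes):

#include <assert.h>

#define VP9_REF_SCALE_SHIFT 14
#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)

static int get_fixed_point_scale_factor(int other_size, int this_size) {
  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
}

int main(void) {
  /* 1920 / 1280 = 1.5, i.e. 1.5 * 16384 = 24576 in Q14. */
  const int x_scale_fp = get_fixed_point_scale_factor(1920, 1280);
  assert(x_scale_fp == 24576);
  /* Scaling a coordinate: multiply, then shift back down (640 -> 960). */
  assert((640 * x_scale_fp) >> VP9_REF_SCALE_SHIFT == 960);
  /* Unscaled references produce exactly VP9_REF_NO_SCALE. */
  assert(get_fixed_point_scale_factor(1280, 1280) == VP9_REF_NO_SCALE);
  return 0;
}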
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
new file mode 100644
index 000000000..0414dde5e
--- /dev/null
+++ b/vp9/common/vp9_scale.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCALE_H_
+#define VP9_COMMON_VP9_SCALE_H_
+
+#include "vp9/common/vp9_mv.h"
+#include "vp9/common/vp9_convolve.h"
+
+#define VP9_REF_SCALE_SHIFT 14
+#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT)
+
+struct scale_factors {
+  int x_scale_fp;  // horizontal fixed point scale factor
+  int y_scale_fp;  // vertical fixed point scale factor
+  int x_offset_q4;
+  int x_step_q4;
+  int y_offset_q4;
+  int y_step_q4;
+
+  int (*scale_value_x)(int val, const struct scale_factors *scale);
+  int (*scale_value_y)(int val, const struct scale_factors *scale);
+  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
+  MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale);
+
+  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
+};
+
+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+                                       int other_w, int other_h,
+                                       int this_w, int this_h);
+
+#endif  // VP9_COMMON_VP9_SCALE_H_
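Note: a hedged usage sketch of the new public entry point — a hypothetical
caller fills a scale_factors once per reference frame, then uses the installed
function pointers per block. Frame sizes and MV values are illustrative:

#include "vp9/common/vp9_scale.h"

/* Hypothetical caller: set up scaling from a 1920x1080 reference to a
 * 1280x720 current frame, then scale one motion vector. */
void example_setup_and_scale(void) {
  struct scale_factors sf;
  MV mv = { 32, -16 };  /* illustrative subpel MV */
  MV32 scaled;

  vp9_setup_scale_factors_for_frame(&sf, 1920, 1080, 1280, 720);
  sf.set_scaled_offsets(&sf, /*row=*/0, /*col=*/0);
  scaled = sf.scale_mv(&mv, &sf);
  (void)scaled;
}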