32 files changed, 1429 insertions, 1433 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index d1da5fe0c..2f60e38fa 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -288,6 +288,18 @@ typedef struct superblockd {
   DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]);
 } SUPERBLOCKD;
 
+struct scale_factors {
+  int x_num;        // horizontal scale numerator (width of the other frame)
+  int x_den;        // horizontal scale denominator (width of this frame)
+  int x_offset_q4;  // low 4 bits of the scaled x position, in 1/16 pel
+  int x_step_q4;    // 16 * x_num / x_den; 16 means no horizontal scaling
+  int y_num;        // vertical scale numerator (height of the other frame)
+  int y_den;        // vertical scale denominator (height of this frame)
+  int y_offset_q4;  // low 4 bits of the scaled y position, in 1/16 pel
+  int y_step_q4;    // 16 * y_num / y_den; 16 means no vertical scaling
+  convolve_fn_t predict[2][2][2];  // [x has subpel][y has subpel][avg]
+};
+
 typedef struct macroblockd {
   DECLARE_ALIGNED(16, int16_t,  diff[384]);      /* from idct diff */
   DECLARE_ALIGNED(16, uint8_t,  predictor[384]);
@@ -303,6 +315,8 @@ typedef struct macroblockd {
   YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
   YV12_BUFFER_CONFIG second_pre;
   YV12_BUFFER_CONFIG dst;
+  struct scale_factors scale_factor[2];
+  struct scale_factors scale_factor_uv[2];
 
   MODE_INFO *prev_mode_info_context;
   MODE_INFO *mode_info_context;
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index b87c410df..b062e7dc7 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -19,7 +19,6 @@
 
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT  7
-#define ALIGN_FILTERS_256 0
 
 /* Assume a bank of 16 filters to choose from. There are two implementations
  * for filter wrapping behavior, since we want to be able to pick which filter
@@ -34,8 +33,11 @@
  *    always 256 byte aligned.
  *
  * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
- * parameters, and switching between them is trivial.
+ * parameters, and switching between them is trivial, with the
+ * ALIGN_FILTERS_256 macro, below.
  */
+ #define ALIGN_FILTERS_256 1
+
 static void convolve_horiz_c(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              const int16_t *filter_x0, int x_step_q4,
@@ -56,11 +58,12 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride,
     const int16_t *filter_x = filter_x0;
 
     /* Initial phase offset */
-    int x_q4 = (filter_x - filter_x_base) / taps;
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
 
     for (x = 0; x < w; ++x) {
       /* Per-pixel src offset */
-      int src_x = x_q4 >> 4;
+      int src_x = (x_q4 - x0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[src_x + k] * filter_x[k];
@@ -97,11 +100,12 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
     const int16_t *filter_x = filter_x0;
 
     /* Initial phase offset */
-    int x_q4 = (filter_x - filter_x_base) / taps;
+    int x0_q4 = (filter_x - filter_x_base) / taps;
+    int x_q4 = x0_q4;
 
     for (x = 0; x < w; ++x) {
       /* Per-pixel src offset */
-      int src_x = x_q4 >> 4;
+      int src_x = (x_q4 - x0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[src_x + k] * filter_x[k];
@@ -138,11 +142,12 @@ static void convolve_vert_c(const uint8_t *src, int src_stride,
     const int16_t *filter_y = filter_y0;
 
     /* Initial phase offset */
-    int y_q4 = (filter_y - filter_y_base) / taps;
+    int y0_q4 = (filter_y - filter_y_base) / taps;
+    int y_q4 = y0_q4;
 
     for (y = 0; y < h; ++y) {
       /* Per-pixel src offset */
-      int src_y = y_q4 >> 4;
+      int src_y = (y_q4 - y0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[(src_y + k) * src_stride] * filter_y[k];
@@ -179,11 +184,12 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
     const int16_t *filter_y = filter_y0;
 
     /* Initial phase offset */
-    int y_q4 = (filter_y - filter_y_base) / taps;
+    int y0_q4 = (filter_y - filter_y_base) / taps;
+    int y_q4 = y0_q4;
 
     for (y = 0; y < h; ++y) {
       /* Per-pixel src offset */
-      int src_y = y_q4 >> 4;
+      int src_y = (y_q4 - y0_q4) >> 4;
 
       for (sum = 0, k = 0; k < taps; ++k) {
         sum += src[(src_y + k) * src_stride] * filter_y[k];
@@ -206,16 +212,25 @@ static void convolve_c(const uint8_t *src, int src_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h, int taps) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  uint8_t temp[16 * 23];
+  /* Fixed size intermediate buffer places limits on parameters.
+   * Maximum intermediate_height is 39, for y_step_q4 == 32,
+   * h == 16, taps == 8.
+   */
+  uint8_t temp[16 * 39];
+  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+
   assert(w <= 16);
   assert(h <= 16);
   assert(taps <= 8);
+  assert(y_step_q4 <= 32);
+
+  if (intermediate_height < h)
+    intermediate_height = h;
 
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
                    temp, 16,
                    filter_x, x_step_q4, filter_y, y_step_q4,
-                   w, h + taps - 1, taps);
+                   w, intermediate_height, taps);
   convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
                   filter_x, x_step_q4, filter_y, y_step_q4,
                   w, h, taps);
@@ -226,16 +241,25 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h, int taps) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  uint8_t temp[16 * 23];
+  /* Fixed size intermediate buffer places limits on parameters.
+   * Maximum intermediate_height is 39, for y_step_q4 == 32,
+   * h == 16, taps == 8.
+   */
+  uint8_t temp[16 * 39];
+  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
+
   assert(w <= 16);
   assert(h <= 16);
   assert(taps <= 8);
+  assert(y_step_q4 <= 32);
+
+  if (intermediate_height < h)
+    intermediate_height = h;
 
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
                    temp, 16,
                    filter_x, x_step_q4, filter_y, y_step_q4,
-                   w, h + taps - 1, taps);
+                   w, intermediate_height, taps);
   convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       w, h, taps);
@@ -318,25 +342,17 @@ void vp9_convolve_copy(const uint8_t *src, int src_stride,
                        const int16_t *filter_x, int filter_x_stride,
                        const int16_t *filter_y, int filter_y_stride,
                        int w, int h) {
-  if (h == 16) {
+  if (w == 16 && h == 16) {
     vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
-  } else if (h == 8) {
+  } else if (w == 8 && h == 8) {
     vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
-  } else if (w == 8) {
+  } else if (w == 8 && h == 4) {
     vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
   } else {
-    // 4x4
     int r;
 
-    for (r = 0; r < 4; ++r) {
-#if !(CONFIG_FAST_UNALIGNED)
-      dst[0]  = src[0];
-      dst[1]  = src[1];
-      dst[2]  = src[2];
-      dst[3]  = src[3];
-#else
-      *(uint32_t *)dst = *(const uint32_t *)src;
-#endif
+    for (r = h; r > 0; --r) {
+      memcpy(dst, src, w);
       src += src_stride;
       dst += dst_stride;
     }
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 46c935ab7..8c4856187 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -33,11 +33,8 @@ void vp9_convolve_avg(const uint8_t *src, int src_stride,
                       int w, int h);
 
 struct subpix_fn_table {
-  convolve_fn_t predict[2][2][2];  // horiz, vert, avg
   const int16_t (*filter_x)[8];
   const int16_t (*filter_y)[8];
-  int x_step_q4;
-  int y_step_q4;
 };
 
 #endif  // VP9_COMMON_CONVOLVE_H_
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 5e425895f..434c63e7e 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -15,7 +15,7 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
   { 0, 0, 0, 120,   8, 0, 0, 0 },
   { 0, 0, 0, 112,  16, 0, 0, 0 },
@@ -36,7 +36,8 @@ DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
 
 #define FILTER_ALPHA       0
 #define FILTER_ALPHA_SHARP 1
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8])
+    = {
 #if FILTER_ALPHA == 0
   /* Lagrangian interpolation filter */
   { 0,   0,   0, 128,   0,   0,   0,  0},
@@ -55,6 +56,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
   { -1,   3,  -9,  27, 118, -13,   4, -1},
   { 0,   2,  -6,  18, 122, -10,   3, -1},
   { 0,   1,  -3,   8, 126,  -5,   1,  0}
+
 #elif FILTER_ALPHA == 50
   /* Generated using MATLAB:
    * alpha = 0.5;
@@ -82,7 +84,8 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
 #endif  /* FILTER_ALPHA */
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
+    = {
 #if FILTER_ALPHA_SHARP == 1
   /* dct based filter */
   {0,   0,   0, 128,   0,   0,   0, 0},
@@ -101,6 +104,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
   {-2,   5, -10,  27, 121, -17,   7, -3},
   {-1,   3,  -6,  17, 125, -13,   5, -2},
   {0,   1,  -3,   8, 127,  -7,   3, -1}
+
 #elif FILTER_ALPHA_SHARP == 75
   /* alpha = 0.75 */
   {0,   0,   0, 128,   0,   0,   0, 0},
@@ -122,7 +126,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
 #endif  /* FILTER_ALPHA_SHARP */
 };
 
-DECLARE_ALIGNED(16, const int16_t,
+DECLARE_ALIGNED(256, const int16_t,
                 vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
   /* 8-tap lowpass filter */
   /* Hamming window */
@@ -144,7 +148,8 @@ DECLARE_ALIGNED(16, const int16_t,
   { 1, -2, -7, 37, 80, 28, -8, -1}
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8])
+    = {
   {0, 0,   0, 128,   0,   0, 0,  0},
   {0, 1,  -5, 125,   8,  -2, 1,  0},
   {0, 1,  -8, 122,  17,  -5, 1,  0},
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index d25d0ac2a..3e0ee4b63 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -68,4 +68,11 @@ static INLINE int dct_const_round_shift(int input) {
   assert(INT16_MIN <= rv && rv <= INT16_MAX);
   return rv;
 }
+
+static INLINE int dct_32_round(int input) {
+  int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;  // round, descale
+  assert(-131072 <= rv && rv <= 131071);  // must fit in 18 signed bits
+  return rv;
+}
+
 #endif
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 19397028b..f34823b36 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -115,7 +115,7 @@ void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
   }
 }
 
-void idct4_1d(int16_t *input, int16_t *output) {
+static void idct4_1d(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -193,7 +193,7 @@ void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
   }
 }
 
-void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -313,10 +313,9 @@ static const transform_2d IHT_4[] = {
 
 void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
+  int i, j;
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
   int16_t temp_in[4], temp_out[4];
   const transform_2d ht = IHT_4[tx_type];
 
@@ -333,7 +332,7 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 4 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
 }
 
@@ -423,10 +422,9 @@ static const transform_2d IHT_8[] = {
 
 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
+  int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
   int16_t temp_in[8], temp_out[8];
   const transform_2d ht = IHT_8[tx_type];
 
@@ -443,7 +441,7 @@ void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
   }
 }
 
@@ -479,7 +477,7 @@ void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
   output[0] = ROUND_POWER_OF_TWO(out, 5);
 }
 
-void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
 
@@ -846,18 +844,17 @@ static const transform_2d IHT_16[] = {
 };
 
 void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, TX_TYPE tx_type) {
+                          int input_pitch, TX_TYPE tx_type) {
+  int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
-  int i, j;
   int16_t temp_in[16], temp_out[16];
   const transform_2d ht = IHT_16[tx_type];
 
   // Rows
   for (i = 0; i < 16; ++i) {
     ht.rows(input, outptr);
-    input += half_pitch;
+    input += input_pitch;
     outptr += 16;
   }
 
@@ -905,7 +902,7 @@ void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
 
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index c6b961894..d431ea24b 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -25,8 +25,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
   for (i = 0; i < 16; i++) {
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff,
-                       32, tx_type);
+      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
       vp9_inverse_transform_b_4x4(xd, i, 32);
     }
@@ -58,8 +57,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   for (i = 0; i < 9; i += 8) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff,
-                           32, tx_type);
+      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -69,7 +67,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
     TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
       vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
-                           32, tx_type);
+                           16, tx_type);
     } else {
       vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
                                   &blockd[i].diff[0], 32);
@@ -101,7 +99,7 @@ void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
   BLOCKD *bd = &xd->block[0];
   TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
   if (tx_type != DCT_DCT) {
-    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 32, tx_type);
+    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);
   } else {
     vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                   &xd->block[0].diff[0], 32);
diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 8acd4046b..a1eef4649 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -23,4 +23,14 @@ typedef union int_mv {
   MV as_mv;
 } int_mv; /* facilitates faster equality tests and copies */
 
+struct mv32 {
+  int32_t row;  // vertical component
+  int32_t col;  // horizontal component
+};
+
+typedef union int_mv32 {
+  uint64_t    as_int;  // both components viewed as one integer
+  struct mv32 as_mv;
+} int_mv32; /* facilitates faster equality tests and copies */
+
 #endif  // VP9_COMMON_VP9_MV_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index e952fe933..c4bb12340 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -39,7 +39,11 @@ void vp9_initialize_common(void);
 
 #define NUM_REF_FRAMES 3
 #define NUM_REF_FRAMES_LG2 2
-#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 1)
+
+// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4)
 
 #define NUM_FRAME_CONTEXTS_LG2 2
 #define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2)
@@ -128,6 +132,8 @@ typedef struct VP9Common {
 
   int Width;
   int Height;
+  int last_width;
+  int last_height;
   int horiz_scale;
   int vert_scale;
 
@@ -145,6 +151,7 @@ typedef struct VP9Common {
    */
   int active_ref_idx[3]; /* each frame can reference 3 buffers */
   int new_fb_idx;
+  struct scale_factors active_ref_scale[3];
 
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG temp_scale_frame;
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index b75525e2c..30e8951af 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -17,26 +17,97 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
-                              VP9_COMMON *cm) {
+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+                                       YV12_BUFFER_CONFIG *other,
+                                       int this_w, int this_h) {
+  int other_w, other_h;
+
+  other_h = other->y_height;
+  other_w = other->y_width;
+  scale->x_num = other_w;
+  scale->x_den = this_w;
+  scale->x_offset_q4 = 0;  // calculated per-mb
+  scale->x_step_q4 = 16 * other_w / this_w;
+  scale->y_num = other_h;
+  scale->y_den = this_h;
+  scale->y_offset_q4 = 0;  // calculated per-mb
+  scale->y_step_q4 = 16 * other_h / this_h;
+
   // TODO(agrange): Investigate the best choice of functions to use here
   // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
   // to do at full-pel offsets. The current selection, where the filter is
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
-  xd->subpix.predict[0][0][0] = vp9_convolve_copy;
-  xd->subpix.predict[0][0][1] = vp9_convolve_avg;
-  xd->subpix.predict[0][1][0] = vp9_convolve8_vert;
-  xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert;
-  xd->subpix.predict[1][0][0] = vp9_convolve8_horiz;
-  xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz;
-  xd->subpix.predict[1][1][0] = vp9_convolve8;
-  xd->subpix.predict[1][1][1] = vp9_convolve8_avg;
-
-  xd->subpix.x_step_q4 = 16;
-  xd->subpix.y_step_q4 = 16;
+  if (scale->x_step_q4 == 16) {
+    if (scale->y_step_q4 == 16) {
+      // No scaling in either direction.
+      scale->predict[0][0][0] = vp9_convolve_copy;
+      scale->predict[0][0][1] = vp9_convolve_avg;
+      scale->predict[0][1][0] = vp9_convolve8_vert;
+      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
+      scale->predict[1][0][0] = vp9_convolve8_horiz;
+      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+    } else {
+      // No scaling in x direction. Must always scale in the y direction.
+      scale->predict[0][0][0] = vp9_convolve8_vert;
+      scale->predict[0][0][1] = vp9_convolve8_avg_vert;
+      scale->predict[0][1][0] = vp9_convolve8_vert;
+      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
+      scale->predict[1][0][0] = vp9_convolve8;
+      scale->predict[1][0][1] = vp9_convolve8_avg;
+    }
+  } else {
+    if (scale->y_step_q4 == 16) {
+      // No scaling in the y direction. Must always scale in the x direction.
+      scale->predict[0][0][0] = vp9_convolve8_horiz;
+      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
+      scale->predict[0][1][0] = vp9_convolve8;
+      scale->predict[0][1][1] = vp9_convolve8_avg;
+      scale->predict[1][0][0] = vp9_convolve8_horiz;
+      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+    } else {
+      // Must always scale in both directions.
+      scale->predict[0][0][0] = vp9_convolve8;
+      scale->predict[0][0][1] = vp9_convolve8_avg;
+      scale->predict[0][1][0] = vp9_convolve8;
+      scale->predict[0][1][1] = vp9_convolve8_avg;
+      scale->predict[1][0][0] = vp9_convolve8;
+      scale->predict[1][0][1] = vp9_convolve8_avg;
+    }
+  }
+  // 2D subpel motion always gets filtered in both directions
+  scale->predict[1][1][0] = vp9_convolve8;
+  scale->predict[1][1][1] = vp9_convolve8_avg;
+}
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
+                              VP9_COMMON *cm) {
+  int i;
+
+  /* Calculate scaling factors for each of the 3 available references */
+  for (i = 0; i < 3; ++i) {
+    if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
+      memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
+      continue;
+    }
+
+    vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
+                                      &cm->yv12_fb[cm->active_ref_idx[i]],
+                                      cm->mb_cols * 16, cm->mb_rows * 16);
+  }
+
+  if (xd->mode_info_context) {
+    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+
+    set_scale_factors(xd,
+                      mbmi->ref_frame - 1,
+                      mbmi->second_ref_frame - 1,
+                      cm->active_ref_scale);
+  }
+
+
   switch (mcomp_filter_type) {
     case EIGHTTAP:
     case SWITCHABLE:
@@ -57,6 +128,7 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
       break;
 #endif
   }
+  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
 
 void vp9_copy_mem16x16_c(const uint8_t *src,
@@ -146,113 +218,151 @@ void vp9_copy_mem8x4_c(const uint8_t *src,
   }
 }
 
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                  struct subpix_fn_table *subpix) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv[0].as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
-      ptr, d->pre_stride, pred_ptr, pitch,
-      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
-      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
-      4, 4);
+static void set_scaled_offsets(struct scale_factors *scale,
+                               int row, int col) {
+  const int x_q4 = 16 * col;  // column position in 1/16 units
+  const int y_q4 = 16 * row;  // row position in 1/16 units
+  // Scale by num / den and keep only the low 4 (fractional) bits.
+  scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
+  scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
 }
 
-/*
- * Similar to vp9_build_inter_predictors_b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to vp9_build_inter_predictors_b()) with the
- * predictor of the second reference frame / motion vector.
- */
-void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                      struct subpix_fn_table *subpix) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv[1].as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
-      ptr, d->pre_stride, pred_ptr, pitch,
-      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
-      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
-      4, 4);
+static int32_t scale_motion_vector_component_q3(int mv_q3,
+                                                int num,
+                                                int den,
+                                                int offset_q4) {
+  // Returns the q3 component converted to q4, scaled by num / den, plus
+  // offset_q4. Uses "* 2", not "<< 1": left-shifting a negative int is UB.
+  const int32_t mv_q4 = mv_q3 * 2;
+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+  return mv_q4 * num / den + offset_q4;
 }
 
-void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv[0].as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
-      ptr, d->pre_stride, pred_ptr, pitch,
-      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
-      8, 8);
+static int32_t scale_motion_vector_component_q4(int mv_q4,
+                                                int num,
+                                                int den,
+                                                int offset_q4) {
+  // Returns mv_q4 scaled by num / den (integer division) plus offset_q4.
+
+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+  return mv_q4 * num / den + offset_q4;
 }
 
-/*
- * Similar to build_inter_predictors_4b(), but instead of storing the
- * results in d->predictor, we average the contents of d->predictor (which
- * come from an earlier call to build_inter_predictors_4b()) with the
- * predictor of the second reference frame / motion vector.
+static int_mv32 scale_motion_vector_q3_to_q4(
+    const int_mv *src_mv,
+    const struct scale_factors *scale) {
+  // Returns src_mv (q3) converted to q4: mv * num / den + offset, per axis.
+  int_mv32 result;
+
+  result.as_mv.row = scale_motion_vector_component_q3(src_mv->as_mv.row,
+                                                      scale->y_num,
+                                                      scale->y_den,
+                                                      scale->y_offset_q4);
+  result.as_mv.col = scale_motion_vector_component_q3(src_mv->as_mv.col,
+                                                      scale->x_num,
+                                                      scale->x_den,
+                                                      scale->x_offset_q4);
+  return result;
+}
+
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int_mv *mv_q3,
+                               const struct scale_factors *scale,
+                               int w, int h, int do_avg,
+                               const struct subpix_fn_table *subpix) {
+  int_mv32 mv;
+  // Scale the q3 vector to q4, then step src by its full-pel part.
+  mv = scale_motion_vector_q3_to_q4(mv_q3, scale);
+  src = src + (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
+  // The low 4 bits select the filter kernels and the predict[][][] entry.
+  scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][do_avg](
+      src, src_stride, dst, dst_stride,
+      subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4,
+      subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4,
+      w, h);
+}
+
+/* Like vp9_build_inter_predictor, but takes the full-pel part of the
+ * mv separately, and the fractional part as a q4.
  */
-void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                      BLOCKD *d, int pitch) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_second_pre);
-  mv.as_int = d->bmi.as_mv[1].as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
-      ptr, d->pre_stride, pred_ptr, pitch,
-      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
-      8, 8);
+void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride,
+                                  const int_mv *fullpel_mv_q3,
+                                  const int_mv *frac_mv_q4,
+                                  const struct scale_factors *scale,
+                                  int w, int h, int do_avg,
+                                  const struct subpix_fn_table *subpix) {
+  // Full-pel part is rebuilt with "* 16" rather than "<< 4": left-shifting
+  // a negative value is undefined behavior in C.
+  const int mv_row_q4 = ((fullpel_mv_q3->as_mv.row >> 3) * 16)
+                        + (frac_mv_q4->as_mv.row & 0xf);
+  const int mv_col_q4 = ((fullpel_mv_q3->as_mv.col >> 3) * 16)
+                        + (frac_mv_q4->as_mv.col & 0xf);
+  const int scaled_mv_row_q4 =
+      scale_motion_vector_component_q4(mv_row_q4, scale->y_num, scale->y_den,
+                                       scale->y_offset_q4);
+  const int scaled_mv_col_q4 =
+      scale_motion_vector_component_q4(mv_col_q4, scale->x_num, scale->x_den,
+                                       scale->x_offset_q4);
+  const int subpel_x = scaled_mv_col_q4 & 15;  // filter phase, horizontal
+  const int subpel_y = scaled_mv_row_q4 & 15;  // filter phase, vertical
+  src = src + (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);
+  scale->predict[!!subpel_x][!!subpel_y][do_avg](
+      src, src_stride, dst, dst_stride,
+      subpix->filter_x[subpel_x], scale->x_step_q4,
+      subpix->filter_y[subpel_y], scale->y_step_q4, w, h);
 }
 
-static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  uint8_t *ptr_base;
-  uint8_t *ptr;
-  uint8_t *pred_ptr = d->predictor;
-  int_mv mv;
-
-  ptr_base = *(d->base_pre);
-  mv.as_int = d->bmi.as_mv[0].as_int;
-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
-        (mv.as_mv.col >> 3);
-
-  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
-      ptr, d->pre_stride, pred_ptr, pitch,
-      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
-      8, 4);
+static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1,
+                                      struct scale_factors *scale,
+                                      int block_size, int stride, int which_mv,
+                                      const struct subpix_fn_table *subpix,
+                                      int row, int col) {
+  assert(d1->predictor - d0->predictor == block_size);  // d1 follows d0
+  assert(d1->pre == d0->pre + block_size);
+  // Offsets for d0's position; recomputed below for d1 when needed.
+  set_scaled_offsets(&scale[which_mv], row, col);
+  // Same vector in both blocks: predict them in one double-width call.
+  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
+    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
+
+    vp9_build_inter_predictor(*base_pre + d0->pre,
+                              d0->pre_stride,
+                              d0->predictor, stride,
+                              &d0->bmi.as_mv[which_mv],
+                              &scale[which_mv],
+                              2 * block_size, block_size, which_mv,
+                              subpix);
+
+  } else {
+    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
+    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
+    // Different vectors: predict each block separately.
+    vp9_build_inter_predictor(*base_pre0 + d0->pre,
+                              d0->pre_stride,
+                              d0->predictor, stride,
+                              &d0->bmi.as_mv[which_mv],
+                              &scale[which_mv],
+                              block_size, block_size, which_mv,
+                              subpix);
+
+    set_scaled_offsets(&scale[which_mv], row, col + block_size);
+
+    vp9_build_inter_predictor(*base_pre1 + d1->pre,
+                              d1->pre_stride,
+                              d1->predictor, stride,
+                              &d1->bmi.as_mv[which_mv],
+                              &scale[which_mv],
+                              block_size, block_size, which_mv,
+                              subpix);
+  }
 }
 
 /*encoder only*/
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
+                                        int mb_row,
+                                        int mb_col) {
   int i, j;
   BLOCKD *blockd = xd->block;
 
@@ -329,19 +439,17 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
   }
 
   for (i = 16; i < 24; i += 2) {
+    const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
+    const int x = 4 * (i & 1);
+    const int y = ((i - 16) >> 1) * 4;
+
+    int which_mv;
     BLOCKD *d0 = &blockd[i];
     BLOCKD *d1 = &blockd[i + 1];
 
-    if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
-      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
-    }
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
-      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
+    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
+                                &xd->subpix, mb_row * 8 + y, mb_col * 8 + x);
     }
   }
 }
@@ -383,91 +491,100 @@ static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
 }
 
 /*encoder only*/
-void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             uint8_t *dst_y,
-                                             int dst_ystride,
-                                             int clamp_mvs) {
-  uint8_t *ptr_base = xd->pre.y_buffer;
-  uint8_t *ptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv ymv;
-
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (clamp_mvs)
-    clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
-
-  xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0](
-      ptr, pre_stride, dst_y, dst_ystride,
-      xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
-      16, 16);
-}
-
-void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              uint8_t *dst_u,
-                                              uint8_t *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  uint8_t *uptr, *vptr;
-  int pre_stride = xd->block[0].pre_stride;
-  int_mv _o16x16mv;
-  int_mv _16x16mv;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  _o16x16mv = _16x16mv;
-  /* calc uv motion vectors */
-  if (_16x16mv.as_mv.row < 0)
-    _16x16mv.as_mv.row -= 1;
-  else
-    _16x16mv.as_mv.row += 1;
-
-  if (_16x16mv.as_mv.col < 0)
-    _16x16mv.as_mv.col -= 1;
-  else
-    _16x16mv.as_mv.col += 1;
-
-  _16x16mv.as_mv.row /= 2;
-  _16x16mv.as_mv.col /= 2;
-
-  _16x16mv.as_mv.row &= xd->fullpixel_mask;
-  _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
-  uptr = xd->pre.u_buffer + offset;
-  vptr = xd->pre.v_buffer + offset;
-
-  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
-                    [!!(_o16x16mv.as_mv.row & 15)][0](
-      uptr, pre_stride, dst_u, dst_uvstride,
-      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
-      8, 8);
-
-  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
-                    [!!(_o16x16mv.as_mv.row & 15)][0](
-      vptr, pre_stride, dst_v, dst_uvstride,
-      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
-      8, 8);
+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,  /* 16x16 Y pred. */
+                                         uint8_t *dst_y,  /* dst of each call */
+                                         int dst_ystride,
+                                         int mb_row,  /* scaled by 16 below */
+                                         int mb_col) {
+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
+  int which_mv;  /* 0: mv[0] from xd->pre, 1: mv[1] from xd->second_pre */
+
+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+    const int clamp_mvs =  /* clamp flag that belongs to the MV in use */
+        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
+                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
+    uint8_t *base_pre;
+    int_mv ymv;  /* local copy of mbmi.mv[which_mv] */
+    int pre_stride;
+
+    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
+    base_pre = which_mv ? xd->second_pre.y_buffer
+                        : xd->pre.y_buffer;
+    pre_stride = which_mv ? xd->second_pre.y_stride
+                          : xd->pre.y_stride;
+    if (clamp_mvs)
+      clamp_mv_to_umv_border(&ymv.as_mv, xd);  /* applied to the local ymv */
+
+    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);
+
+    vp9_build_inter_predictor(base_pre, pre_stride,
+                              dst_y, dst_ystride,
+                              &ymv, &xd->scale_factor[which_mv],
+                              16, 16, which_mv, &xd->subpix);  /* do_avg = which_mv */
+  }
 }
 
-
-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            uint8_t *dst_y,
-                                            uint8_t *dst_u,
-                                            uint8_t *dst_v,
-                                            int dst_ystride, int dst_uvstride) {
-  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
-      xd->mode_info_context->mbmi.need_to_clamp_mvs);
-  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,  /* 8x8 U and V pred. */
+                                          uint8_t *dst_u,
+                                          uint8_t *dst_v,
+                                          int dst_uvstride,  /* shared by U and V */
+                                          int mb_row,
+                                          int mb_col) {
+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
+  int which_mv;  /* 0: mv[0] from xd->pre, 1: mv[1] from xd->second_pre */
+
+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+    const int clamp_mvs =  /* clamp flag that belongs to the MV in use */
+        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
+                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
+    uint8_t *uptr, *vptr;
+    int pre_stride = which_mv ? xd->second_pre.y_stride  /* halved before use */
+                              : xd->pre.y_stride;
+    int_mv _o16x16mv;  /* clamped MV before halving; passed as frac_mv_q4 */
+    int_mv _16x16mv;  /* halved and masked MV; passed as fullpel_mv_q3 */
+
+    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
+
+    if (clamp_mvs)
+      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
+
+    _o16x16mv = _16x16mv;  /* snapshot taken after clamping */
+    /* calc uv motion vectors */
+    if (_16x16mv.as_mv.row < 0)
+      _16x16mv.as_mv.row -= 1;
+    else
+      _16x16mv.as_mv.row += 1;
+
+    if (_16x16mv.as_mv.col < 0)
+      _16x16mv.as_mv.col -= 1;
+    else
+      _16x16mv.as_mv.col += 1;
+
+    _16x16mv.as_mv.row /= 2;  /* with the +/-1 above: halve, ties away from 0 */
+    _16x16mv.as_mv.col /= 2;  /* (given truncating integer division) */
+
+    _16x16mv.as_mv.row &= xd->fullpixel_mask;
+    _16x16mv.as_mv.col &= xd->fullpixel_mask;
+
+    pre_stride >>= 1;  /* chroma stride taken as half the Y stride, not uv_stride */
+    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
+    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
+
+    set_scaled_offsets(&xd->scale_factor_uv[which_mv],
+                       mb_row * 16, mb_col * 16);  /* NOTE(review): 4x4 UV path passes mb_row * 8 -- confirm */
+
+    vp9_build_inter_predictor_q4(uptr, pre_stride,
+                                 dst_u, dst_uvstride,
+                                 &_16x16mv, &_o16x16mv,
+                                 &xd->scale_factor_uv[which_mv],
+                                 8, 8, which_mv, &xd->subpix);  /* do_avg = which_mv */
+
+    vp9_build_inter_predictor_q4(vptr, pre_stride,
+                                 dst_v, dst_uvstride,
+                                 &_16x16mv, &_o16x16mv,  /* same MVs as for U */
+                                 &xd->scale_factor_uv[which_mv],
+                                 8, 8, which_mv, &xd->subpix);
+  }
 }
 
 void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
@@ -475,7 +592,9 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
                                         uint8_t *dst_u,
                                         uint8_t *dst_v,
                                         int dst_ystride,
-                                        int dst_uvstride) {
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col) {
   uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
   uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
           *v2 = x->second_pre.v_buffer;
@@ -488,32 +607,43 @@ void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
 
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
+    int scaled_uv_offset;
 
     x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
     x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
     x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
     x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
 
-    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
+                                                y_idx * 16,
+                                                x->pre.y_stride,
+                                                &x->scale_factor[0]);
+    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
+                                            y_idx * 8,
+                                            x->pre.uv_stride,
+                                            &x->scale_factor_uv[0]);
+    x->pre.u_buffer = u1 + scaled_uv_offset;
+    x->pre.v_buffer = v1 + scaled_uv_offset;
 
-    vp9_build_1st_inter16x16_predictors_mb(x,
-      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-      dst_ystride, dst_uvstride);
     if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
-      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
-      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+      x->second_pre.y_buffer = y2 +
+          scaled_buffer_offset(x_idx * 16,
+                               y_idx * 16,
+                               x->second_pre.y_stride,
+                               &x->scale_factor[1]);
+      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
+                                              y_idx * 8,
+                                              x->second_pre.uv_stride,
+                                              &x->scale_factor_uv[1]);
+      x->second_pre.u_buffer = u2 + scaled_uv_offset;
+      x->second_pre.v_buffer = v2 + scaled_uv_offset;
+    }
 
-      vp9_build_2nd_inter16x16_predictors_mb(x,
+    vp9_build_inter16x16_predictors_mb(x,
         dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
         dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
         dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_ystride, dst_uvstride);
-    }
+        dst_ystride, dst_uvstride, mb_row + y_idx, mb_col + x_idx);
   }
 
   x->mb_to_top_edge    = edge[0];
@@ -544,7 +674,9 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
                                         uint8_t *dst_u,
                                         uint8_t *dst_v,
                                         int dst_ystride,
-                                        int dst_uvstride) {
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col) {
   uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
   uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
           *v2 = x->second_pre.v_buffer;
@@ -557,27 +689,43 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
 
   for (n = 0; n < 4; n++) {
     const int x_idx = n & 1, y_idx = n >> 1;
+    int scaled_uv_offset;
 
     x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
     x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
     x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
     x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
 
-    x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
-    x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
-    x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,
+                                                y_idx * 32,
+                                                x->pre.y_stride,
+                                                &x->scale_factor[0]);
+    scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
+                                            y_idx * 16,
+                                            x->pre.uv_stride,
+                                            &x->scale_factor_uv[0]);
+    x->pre.u_buffer = u1 + scaled_uv_offset;
+    x->pre.v_buffer = v1 + scaled_uv_offset;
 
     if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
-      x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
-      x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+      x->second_pre.y_buffer = y2 +
+          scaled_buffer_offset(x_idx * 32,
+                               y_idx * 32,
+                               x->second_pre.y_stride,
+                               &x->scale_factor[1]);
+      scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
+                                              y_idx * 16,
+                                              x->second_pre.uv_stride,
+                                              &x->scale_factor_uv[1]);
+      x->second_pre.u_buffer = u2 + scaled_uv_offset;
+      x->second_pre.v_buffer = v2 + scaled_uv_offset;
     }
 
     vp9_build_inter32x32_predictors_sb(x,
         dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
         dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
         dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
-        dst_ystride, dst_uvstride);
+        dst_ystride, dst_uvstride, mb_row + y_idx * 2, mb_col + x_idx * 2);
   }
 
   x->mb_to_top_edge    = edge[0];
@@ -603,171 +751,48 @@ void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
 #endif
 }
 
-/*
- * The following functions should be called after an initial
- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * It will run a second filter on a (different) ref
- * frame and average the result with the output of the
- * first filter. The second reference frame is stored
- * in x->second_pre (the reference frame index is in
- * x->mode_info_context->mbmi.second_ref_frame). The second
- * motion vector is x->mode_info_context->mbmi.second_mv.
- *
- * This allows blending prediction from two reference frames
- * which sometimes leads to better prediction than from a
- * single reference framer.
- */
-void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             uint8_t *dst_y,
-                                             int dst_ystride) {
-  uint8_t *ptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-
-  uint8_t *ptr_base = xd->second_pre.y_buffer;
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
-
-  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1](
-      ptr, pre_stride, dst_y, dst_ystride,
-      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
-      16, 16);
-}
-
-void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              uint8_t *dst_u,
-                                              uint8_t *dst_v,
-                                              int dst_uvstride) {
-  int offset;
-  uint8_t *uptr, *vptr;
-
-  int_mv _16x16mv;
-  int mv_row;
-  int mv_col;
-  int omv_row, omv_col;
-
-  int pre_stride = xd->block[0].pre_stride;
-
-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-
-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)
-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-  mv_row = _16x16mv.as_mv.row;
-  mv_col = _16x16mv.as_mv.col;
-
-  /* calc uv motion vectors */
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row = (mv_row + (mv_row > 0)) >> 1;
-  mv_col = (mv_col + (mv_col > 0)) >> 1;
-
-  mv_row &= xd->fullpixel_mask;
-  mv_col &= xd->fullpixel_mask;
-
-  pre_stride >>= 1;
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = xd->second_pre.u_buffer + offset;
-  vptr = xd->second_pre.v_buffer + offset;
-
-  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
-      uptr, pre_stride, dst_u, dst_uvstride,
-      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
-      8, 8);
-
-  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
-      vptr, pre_stride, dst_v, dst_uvstride,
-      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
-      8, 8);
-}
-
-void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            uint8_t *dst_y,
-                                            uint8_t *dst_u,
-                                            uint8_t *dst_v,
-                                            int dst_ystride,
-                                            int dst_uvstride) {
-  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
-  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
+static void build_inter4x4_predictors_mb(MACROBLOCKD *xd,
+                                         int mb_row, int mb_col) {
   int i;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
   BLOCKD *blockd = xd->block;
+  int which_mv = 0;
+  const int use_second_ref = mbmi->second_ref_frame > 0;
 
   if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];
-    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];
-    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];
-    blockd[10].bmi = xd->mode_info_context->bmi[10];
-
-    if (mbmi->need_to_clamp_mvs) {
-      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv[0].as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv[0].as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv[0].as_mv, xd);
-      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv[0].as_mv, xd);
-      if (mbmi->second_ref_frame > 0) {
-        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv[1].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv[1].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv[1].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv[1].as_mv, xd);
-      }
-    }
+    for (i = 0; i < 16; i += 8) {
+      BLOCKD *d0 = &blockd[i];
+      BLOCKD *d1 = &blockd[i + 2];
+      const int y = i & 8;
 
+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
+      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
 
-    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);
-    vp9_build_inter_predictors4b(xd, &blockd[10], 16);
+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+        if (mbmi->need_to_clamp_mvs) {
+          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
+          clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
+        }
 
-    if (mbmi->second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);
-      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);
+        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16,
+                                  which_mv, &xd->subpix,
+                                  mb_row * 16 + y, mb_col * 16);
+      }
     }
   } else {
     for (i = 0; i < 16; i += 2) {
       BLOCKD *d0 = &blockd[i];
       BLOCKD *d1 = &blockd[i + 1];
+      const int x = (i & 3) * 4;
+      const int y = (i >> 2) * 4;
 
       blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
       blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
 
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[0].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv[0].as_mv, xd);
-        if (mbmi->second_ref_frame > 0) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[1].as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv[1].as_mv, xd);
-        }
-      }
-
-      if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int)
-        build_inter_predictors2b(xd, d0, 16);
-      else {
-        vp9_build_inter_predictors_b(d0, 16, &xd->subpix);
-        vp9_build_inter_predictors_b(d1, 16, &xd->subpix);
-      }
-
-      if (mbmi->second_ref_frame > 0) {
-        vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix);
-        vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix);
+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16,
+                                  which_mv, &xd->subpix,
+                                  mb_row * 16 + y, mb_col * 16 + x);
       }
     }
   }
@@ -775,17 +800,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
   for (i = 16; i < 24; i += 2) {
     BLOCKD *d0 = &blockd[i];
     BLOCKD *d1 = &blockd[i + 1];
+    const int x = 4 * (i & 1);
+    const int y = ((i - 16) >> 1) * 4;
 
-    if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int)
-      build_inter_predictors2b(xd, d0, 8);
-    else {
-      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
-      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
-      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
+    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8,
+                                which_mv, &xd->subpix,
+                                mb_row * 8 + y, mb_col * 8 + x);
     }
   }
 }
@@ -882,22 +903,31 @@ void build_4x4uvmvs(MACROBLOCKD *xd) {
   }
 }
 
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
+void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,  /* Y, then U and V */
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,  /* forwarded unchanged */
+                                        int mb_col) {  /* forwarded unchanged */
+  vp9_build_inter16x16_predictors_mby(xd, dst_y, dst_ystride, mb_row, mb_col);
+  vp9_build_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride,
+                                       mb_row, mb_col);
+}
+
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
+                                   int mb_row,
+                                   int mb_col) {
   if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,
-                                           &xd->predictor[256],
-                                           &xd->predictor[320], 16, 8);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      /* 256 = offset of U plane in Y+U+V buffer;
-       * 320 = offset of V plane in Y+U+V buffer.
-       * (256=16x16, 320=16x16+8x8). */
-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,
-                                             &xd->predictor[256],
-                                             &xd->predictor[320], 16, 8);
-    }
+    vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
+                                       &xd->predictor[256],
+                                       &xd->predictor[320], 16, 8,
+                                       mb_row, mb_col);
+
 #if CONFIG_COMP_INTERINTRA_PRED
-    else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+    if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
       vp9_build_interintra_16x16_predictors_mb(xd, xd->predictor,
                                                &xd->predictor[256],
                                                &xd->predictor[320], 16, 8);
@@ -905,6 +935,6 @@ void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {
 #endif
   } else {
     build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd);
+    build_inter4x4_predictors_mb(xd, mb_row, mb_col);
   }
 }
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 903bd2e86..831ce2a73 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -16,71 +16,126 @@
 
 struct subpix_fn_table;
 
-extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    uint8_t *dst_y,
-                                                    int dst_ystride,
-                                                    int clamp_mvs);
-
-extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     uint8_t *dst_u,
-                                                     uint8_t *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   uint8_t *dst_y,
-                                                   uint8_t *dst_u,
-                                                   uint8_t *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    uint8_t *dst_y,
-                                                    int dst_ystride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     uint8_t *dst_u,
-                                                     uint8_t *dst_v,
-                                                     int dst_uvstride);
-
-extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   uint8_t *dst_y,
-                                                   uint8_t *dst_u,
-                                                   uint8_t *dst_v,
-                                                   int dst_ystride,
-                                                   int dst_uvstride);
-
-extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                               uint8_t *dst_y,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_ystride,
-                                               int dst_uvstride);
-
-extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
-                                               uint8_t *dst_y,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_ystride,
-                                               int dst_uvstride);
-
-extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
-
-extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
-                                         struct subpix_fn_table *sppf);
-
-extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
-                                             struct subpix_fn_table *sppf);
-
-extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
-                                         int pitch);
-
-extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
-                                             BLOCKD *d, int pitch);
-
-extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);
-
-extern void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                                     INTERPOLATIONFILTERTYPE filter,
-                                     VP9_COMMON *cm);
+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
+                                         uint8_t *dst_y,
+                                         int dst_ystride,
+                                         int mb_row,
+                                         int mb_col);
+
+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                          uint8_t *dst_u,
+                                          uint8_t *dst_v,
+                                          int dst_uvstride,
+                                          int mb_row,
+                                          int mb_col);
+
+void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
+                                   int mb_row,
+                                   int mb_col);
+
+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
+                                        int mb_row,
+                                        int mb_col);
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATIONFILTERTYPE filter,
+                              VP9_COMMON *cm);
+
+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+                                       YV12_BUFFER_CONFIG *other,
+                                       int this_w, int this_h);
+
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+                               uint8_t *dst, int dst_stride,
+                               const int_mv *mv_q3,  /* presumably 1/8-pel units -- confirm */
+                               const struct scale_factors *scale,
+                               int w, int h, int do_avg,  /* callers in this patch pass which_mv as do_avg */
+                               const struct subpix_fn_table *subpix);
+
+void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride,
+                                  const int_mv *fullpel_mv_q3,  /* 16x16 UV caller: halved, masked MV */
+                                  const int_mv *frac_mv_q4,  /* 16x16 UV caller: clamped MV before halving */
+                                  const struct scale_factors *scale,
+                                  int w, int h, int do_avg,  /* callers in this patch pass which_mv as do_avg */
+                                  const struct subpix_fn_table *subpix);
+
+static int scale_value_x(int val, const struct scale_factors *scale) {
+  return val * scale->x_num / scale->x_den;  /* multiply, then integer divide; x_den == 0 is not guarded */
+}
+
+static int scale_value_y(int val, const struct scale_factors *scale) {
+  return val * scale->y_num / scale->y_den;  /* multiply, then integer divide; y_den == 0 is not guarded */
+}
+
+static int scaled_buffer_offset(int x_offset,  /* buffer offset of a scaled (x, y) */
+                                int y_offset,
+                                int stride,
+                                const struct scale_factors *scale) {
+  return scale_value_y(y_offset, scale) * stride +  /* scaled y times stride */
+      scale_value_x(x_offset, scale);  /* plus scaled x */
+}
+
+static void setup_pred_block(YV12_BUFFER_CONFIG *dst,  /* *src copy, planes moved to (mb_row, mb_col) */
+                             const YV12_BUFFER_CONFIG *src,
+                             int mb_row, int mb_col,
+                             const struct scale_factors *scale,  /* NULL: unscaled offsets */
+                             const struct scale_factors *scale_uv) {  /* read only if scale != NULL */
+  const int recon_y_stride = src->y_stride;
+  const int recon_uv_stride = src->uv_stride;
+  int recon_yoffset;
+  int recon_uvoffset;
+
+  if (scale) {  /* only scale is tested; scale_uv is used without its own check */
+    recon_yoffset = scaled_buffer_offset(16 * mb_col, 16 * mb_row,
+                                         recon_y_stride, scale);
+    recon_uvoffset = scaled_buffer_offset(8 * mb_col, 8 * mb_row,
+                                          recon_uv_stride, scale_uv);
+  } else {
+    recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
+    recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
+  }
+  *dst = *src;  /* struct copy; the plane pointers are advanced next */
+  dst->y_buffer += recon_yoffset;
+  dst->u_buffer += recon_uvoffset;  /* U and V share one offset */
+  dst->v_buffer += recon_uvoffset;
+}
+
+static void set_scale_factors(MACROBLOCKD *xd,  /* fills xd->scale_factor[] and xd->scale_factor_uv[] */
+    int ref0, int ref1,  /* indices into scale_factor[]; negative falls back to 0 */
+    struct scale_factors scale_factor[MAX_REF_FRAMES]) {
+
+  xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0];
+  xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0];
+  xd->scale_factor_uv[0] = xd->scale_factor[0];  /* UV entries are copies of the Y entries */
+  xd->scale_factor_uv[1] = xd->scale_factor[1];
+}
 
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index 88584ad3b..3031fb699 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,37 +14,43 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
-extern B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                                     int stride, int n);
-extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
+
+B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                              int stride, int n);
+
+B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+
 #if CONFIG_COMP_INTERINTRA_PRED
-extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                                     uint8_t *ypred,
-                                                     uint8_t *upred,
-                                                     uint8_t *vpred,
-                                                     int ystride,
-                                                     int uvstride);
-extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                                      uint8_t *ypred,
-                                                      int ystride);
-extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                       uint8_t *upred,
-                                                       uint8_t *vpred,
-                                                       int uvstride);
+void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride);
+
+void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
+                                               uint8_t *ypred,
+                                               int ystride);
+
+void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                uint8_t *upred,
+                                                uint8_t *vpred,
+                                                int uvstride);
 #endif  // CONFIG_COMP_INTERINTRA_PRED
 
-extern void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                                     uint8_t *ypred,
-                                                     uint8_t *upred,
-                                                     uint8_t *vpred,
-                                                     int ystride,
-                                                     int uvstride);
-extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                                     uint8_t *ypred,
-                                                     uint8_t *upred,
-                                                     uint8_t *vpred,
-                                                     int ystride,
-                                                     int uvstride);
+void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride);
+
+void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride);
 
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index 277d5b217..72613ae07 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -12,10 +12,9 @@
 #include "vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-extern void vpx_scale_rtcd(void);
+void vpx_scale_rtcd(void);
 
-void vp9_rtcd()
-{
+void vp9_rtcd() {
     vpx_scale_rtcd();
     once(setup_rtcd_internal);
 }
diff --git a/vp9/common/vp9_setupintrarecon.h b/vp9/common/vp9_setupintrarecon.h
index 457265528..e389f3c91 100644
--- a/vp9/common/vp9_setupintrarecon.h
+++ b/vp9/common/vp9_setupintrarecon.h
@@ -13,6 +13,6 @@
 
 #include "vpx_scale/yv12config.h"
 
-extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
 
 #endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index a1225f1dc..5893c1132 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -12,6 +12,7 @@
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_common.h"
@@ -697,6 +698,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
   int mb_to_top_edge;
   int mb_to_bottom_edge;
   const int mb_size = 1 << mi->mbmi.sb_type;
+  const int use_prev_in_find_mv_refs = cm->Width == cm->last_width &&
+                                       cm->Height == cm->last_height &&
+                                       !cm->error_resilient_mode;
 
   mb_to_top_edge = xd->mb_to_top_edge;
   mb_to_bottom_edge = xd->mb_to_bottom_edge;
@@ -749,25 +753,22 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
     int_mv nearest_second, nearby_second, best_mv_second;
     vp9_prob mv_ref_p [VP9_MVREFS - 1];
 
-    int recon_y_stride, recon_yoffset;
-    int recon_uv_stride, recon_uvoffset;
     MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
 
     {
       int ref_fb_idx;
+      const int use_prev_in_find_best_ref =
+          xd->scale_factor[0].x_num == xd->scale_factor[0].x_den &&
+          xd->scale_factor[0].y_num == xd->scale_factor[0].y_den &&
+          !cm->error_resilient_mode &&
+          !cm->frame_parallel_decoding_mode;
 
       /* Select the appropriate reference frame for this MB */
       ref_fb_idx = cm->active_ref_idx[ref_frame - 1];
 
-      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride  ;
-      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+      setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx],
+          mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
 #ifdef DEC_DEBUG
       if (dec_debug)
@@ -776,7 +777,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 #endif
       // if (cm->current_video_frame == 1 && mb_row == 4 && mb_col == 5)
       //  printf("Dello\n");
-      vp9_find_mv_refs(cm, xd, mi, cm->error_resilient_mode ? 0 : prev_mi,
+      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
                        ref_frame, mbmi->ref_mvs[ref_frame],
                        cm->ref_frame_sign_bias);
 
@@ -809,10 +810,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
 
       if (mbmi->mode != ZEROMV) {
         vp9_find_best_ref_mvs(xd,
-                              pbi->common.error_resilient_mode ||
-                              pbi->common.frame_parallel_decoding_mode ?
-                              0 : xd->pre.y_buffer,
-                              recon_y_stride,
+                              use_prev_in_find_best_ref ?
+                                  xd->pre.y_buffer : NULL,
+                              xd->pre.y_stride,
                               mbmi->ref_mvs[ref_frame],
                               &nearest, &nearby);
 
@@ -853,27 +853,31 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         mbmi->second_ref_frame = 1;
       if (mbmi->second_ref_frame > 0) {
         int second_ref_fb_idx;
+        int use_prev_in_find_best_ref;
+
+        xd->scale_factor[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];
+        use_prev_in_find_best_ref =
+            xd->scale_factor[1].x_num == xd->scale_factor[1].x_den &&
+            xd->scale_factor[1].y_num == xd->scale_factor[1].y_den &&
+            !cm->error_resilient_mode &&
+            !cm->frame_parallel_decoding_mode;
+
         /* Select the appropriate reference frame for this MB */
         second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
 
-        xd->second_pre.y_buffer =
-          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+        setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
+            mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]);
 
-        vp9_find_mv_refs(cm, xd, mi, cm->error_resilient_mode ? 0 : prev_mi,
+        vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
                          mbmi->second_ref_frame,
                          mbmi->ref_mvs[mbmi->second_ref_frame],
                          cm->ref_frame_sign_bias);
 
         if (mbmi->mode != ZEROMV) {
           vp9_find_best_ref_mvs(xd,
-                                pbi->common.error_resilient_mode ||
-                                pbi->common.frame_parallel_decoding_mode ?
-                                0 : xd->second_pre.y_buffer,
-                                recon_y_stride,
+                                use_prev_in_find_best_ref ?
+                                    xd->second_pre.y_buffer : NULL,
+                                xd->second_pre.y_stride,
                                 mbmi->ref_mvs[mbmi->second_ref_frame],
                                 &nearest_second,
                                 &nearby_second);
@@ -1089,7 +1093,6 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
         break;
 
       case NEWMV:
-
         read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
         read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
                     xd->allow_high_precision_mv);
@@ -1230,8 +1233,12 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
   MODE_INFO *mi = xd->mode_info_context;
   MODE_INFO *prev_mi = xd->prev_mode_info_context;
 
-  if (pbi->common.frame_type == KEY_FRAME)
+  if (pbi->common.frame_type == KEY_FRAME) {
     kfread_modes(pbi, mi, mb_row, mb_col, bc);
-  else
+  } else {
     read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
+    set_scale_factors(xd,
+                      mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
+                      pbi->common.active_ref_scale);
+  }
 }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 05a1bf9e0..eefdbb92b 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -147,7 +147,8 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
 /* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
  *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
  */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
+static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                          int mb_row, int mb_col) {
   BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
 
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
@@ -168,32 +169,26 @@ static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
-                                         xd->dst.uv_stride);
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
     } else if (sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_inter32x32_predictors_sb(xd,
                                          xd->dst.y_buffer,
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
-                                         xd->dst.uv_stride);
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
     } else {
-      vp9_build_1st_inter16x16_predictors_mb(xd,
-                                             xd->dst.y_buffer,
-                                             xd->dst.u_buffer,
-                                             xd->dst.v_buffer,
-                                             xd->dst.y_stride,
-                                             xd->dst.uv_stride);
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        vp9_build_2nd_inter16x16_predictors_mb(xd,
-                                               xd->dst.y_buffer,
-                                               xd->dst.u_buffer,
-                                               xd->dst.v_buffer,
-                                               xd->dst.y_stride,
-                                               xd->dst.uv_stride);
-      }
+      vp9_build_inter16x16_predictors_mb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
 #if CONFIG_COMP_INTERINTRA_PRED
-      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
         vp9_build_interintra_16x16_predictors_mb(xd,
                                                  xd->dst.y_buffer,
                                                  xd->dst.u_buffer,
@@ -608,7 +603,7 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
     /* Special case:  Force the loopfilter to skip when eobtotal and
      * mb_skip_coeff are zero.
      */
-    skip_recon_mb(pbi, xd);
+    skip_recon_mb(pbi, xd, mb_row, mb_col);
     return;
   }
 
@@ -619,7 +614,8 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
   } else {
     vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   /* dequantization and idct */
@@ -729,7 +725,7 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
     /* Special case:  Force the loopfilter to skip when eobtotal and
      * mb_skip_coeff are zero.
      */
-    skip_recon_mb(pbi, xd);
+    skip_recon_mb(pbi, xd, mb_row, mb_col);
     return;
   }
 
@@ -740,7 +736,8 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
   } else {
     vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   /* dequantization and idct */
@@ -841,7 +838,7 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
     /* Special case:  Force the loopfilter to skip when eobtotal and
        mb_skip_coeff are zero. */
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    skip_recon_mb(pbi, xd);
+    skip_recon_mb(pbi, xd, mb_row, mb_col);
     return;
   }
 #ifdef DEC_DEBUG
@@ -868,7 +865,7 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
            xd->mode_info_context->mbmi.mode, tx_size,
            xd->mode_info_context->mbmi.interp_filter);
 #endif
-    vp9_build_inter_predictors_mb(xd);
+    vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   }
 
   if (tx_size == TX_16X16) {
@@ -975,18 +972,14 @@ static void set_refs(VP9D_COMP *pbi, int block_size,
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (mbmi->ref_frame > INTRA_FRAME) {
-    int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride;
+    int ref_fb_idx;
 
     /* Select the appropriate reference frame for this MB */
     ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
-
-    ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-    ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col;
-    xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset;
-    ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-    ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col;
-    xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset;
-    xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset;
+    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
+    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
+    setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     /* propagate errors from reference frames */
     xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
@@ -997,12 +990,9 @@ static void set_refs(VP9D_COMP *pbi, int block_size,
       /* Select the appropriate reference frame for this MB */
       second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
 
-      xd->second_pre.y_buffer =
-          cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset;
-      xd->second_pre.u_buffer =
-          cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset;
-      xd->second_pre.v_buffer =
-          cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset;
+      setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
 
       /* propagate errors from reference frames */
       xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
@@ -1213,6 +1203,26 @@ static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
   }
 }
 
+/* Recompute the macroblock counts in pbi->common from the current
+ * Width/Height, zero cm->mip and reset the mi / prev_mi pointers. */
+static void update_frame_size(VP9D_COMP *pbi) {
+  VP9_COMMON *const common = &pbi->common;
+  /* Round both dimensions up to a whole number of 16-pixel macroblocks. */
+  const int aligned_width = (common->Width + 15) & ~15;
+  const int aligned_height = (common->Height + 15) & ~15;
+
+  common->mb_cols = aligned_width >> 4;
+  common->mb_rows = aligned_height >> 4;
+  common->MBs = common->mb_rows * common->mb_cols;
+  common->mode_info_stride = common->mb_cols + 1;
+  memset(common->mip, 0,
+         (common->mb_cols + 1) * (common->mb_rows + 1) * sizeof(MODE_INFO));
+  vp9_update_mode_info_border(common, common->mip);
+  common->mi = common->mip + common->mode_info_stride + 1;
+  common->prev_mi = common->prev_mip + common->mode_info_stride + 1;
+  vp9_update_mode_info_in_image(common, common->mi);
+}
+
 int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   BOOL_DECODER header_bc, residual_bc;
   VP9_COMMON *const pc = &pbi->common;
@@ -1290,9 +1300,25 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
                              "Invalid frame height");
         }
 
-        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
-          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                             "Failed to allocate frame buffers");
+        if (!pbi->initial_width || !pbi->initial_height) {
+          if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))
+            vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate frame buffers");
+          pbi->initial_width = pc->Width;
+          pbi->initial_height = pc->Height;
+        }
+
+        if (pc->Width > pbi->initial_width) {
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Frame width too large");
+        }
+
+        if (pc->Height > pbi->initial_height) {
+          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Frame height too large");
+        }
+
+        update_frame_size(pbi);
       }
     }
   }
@@ -1304,6 +1330,11 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   init_frame(pbi);
 
+  /* Reset the frame pointers to the current frame size */
+  vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
+                                pc->mb_cols * 16, pc->mb_rows * 16,
+                                VP9BORDERINPIXELS);
+
   if (vp9_start_decode(&header_bc, data,
                        (unsigned int)first_partition_length_in_bytes))
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
@@ -1736,6 +1767,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   }
   corrupt_tokens |= xd->corrupted;
 
+  // keep track of the last coded dimensions
+  pc->last_width = pc->Width;
+  pc->last_height = pc->Height;
+
   /* Collect information about decoder corruption. */
   /* 1. Check first boolean decoder for errors. */
   pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 0e205bdda..1d6c66afd 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -65,7 +65,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
   for (i = 0; i < 16; i++)
     input[i] = dq[i] * input[i];
 
-  vp9_short_iht4x4(input, output, 8, tx_type);
+  vp9_short_iht4x4(input, output, 4, tx_type);
   vpx_memset(input, 0, 32);
 
   add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
@@ -86,7 +86,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
     for (i = 1; i < 64; i++)
       input[i] *= dq[1];
 
-    vp9_short_iht8x8(input, output, 16, tx_type);
+    vp9_short_iht8x8(input, output, 8, tx_type);
     vpx_memset(input, 0, 128);
 
     add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
@@ -247,7 +247,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
       input[i] *= dq[1];
 
     // inverse hybrid transform
-    vp9_short_iht16x16(input, output, 32, tx_type);
+    vp9_short_iht16x16(input, output, 16, tx_type);
 
     // the idct halves ( >> 1) the pitch
     // vp9_short_idct16x16_c(input, output, 32);
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 91042c4fe..599c5bb57 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -110,15 +110,12 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
     case TX_8X8:
       coef_probs  = fc->coef_probs_8x8;
       coef_counts = fc->coef_counts_8x8;
-#if CONFIG_CNVCONTEXT
       above_ec = (A0[aidx] + A0[aidx + 1]) != 0;
       left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;
-#endif
       break;
     case TX_16X16:
       coef_probs  = fc->coef_probs_16x16;
       coef_counts = fc->coef_counts_16x16;
-#if CONFIG_CNVCONTEXT
       if (type == PLANE_TYPE_UV) {
         ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
         ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
@@ -128,12 +125,10 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
         above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3]) != 0;
         left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0;
       }
-#endif
       break;
     case TX_32X32:
       coef_probs = fc->coef_probs_32x32;
       coef_counts = fc->coef_counts_32x32;
-#if CONFIG_CNVCONTEXT
       if (type == PLANE_TYPE_UV) {
         ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
         ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
@@ -153,7 +148,6 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
         left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3] +
                     L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0;
       }
-#endif
       break;
   }
 
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 8c1f76e73..0e6d059af 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -38,6 +38,8 @@ typedef struct VP9Decompressor {
 
   int decoded_key_frame;
 
+  int initial_width;
+  int initial_height;
 } VP9D_COMP;
 
 int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index e2f3e2677..080f4a70b 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -105,7 +105,6 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
   int16_t out[4 * 4];
   int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -137,7 +136,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
   // column transform
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 4;
+      temp_in[j] = input[j * pitch + i] << 4;
     if (i == 0 && temp_in[0])
       temp_in[0] += 1;
     fwdc(temp_in, temp_out);
@@ -308,7 +307,6 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
                         int pitch, TX_TYPE tx_type) {
   int16_t out[64];
   int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
@@ -339,7 +337,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
   // column transform
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 2;
+      temp_in[j] = input[j * pitch + i] << 2;
     fwdc(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
       outptr[j * 8 + i] = temp_out[j];
@@ -697,7 +695,6 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
                           int pitch, TX_TYPE tx_type) {
   int16_t out[256];
   int16_t *outptr = &out[0];
-  const int short_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -728,7 +725,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
   // column transform
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j * short_pitch + i] << 2;
+      temp_in[j] = input[j * pitch + i] << 2;
     fwdc(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -744,412 +741,9 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
   }
 }
 
-#define TEST_INT_32x32_DCT 1
-
-#if !TEST_INT_32x32_DCT
-
-static void dct32_1d(double *input, double *output, int stride) {
-  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
-  static const double C2 = 0.995184726672;  // cos(pi * 2 / 64)
-  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
-  static const double C4 = 0.980785280403;  // cos(pi * 4 / 64)
-  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
-  static const double C6 = 0.956940335732;  // cos(pi * 6 / 64)
-  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
-  static const double C8 = 0.923879532511;  // cos(pi * 8 / 64)
-  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
-  static const double C10 = 0.881921264348;  // cos(pi * 10 / 64)
-  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
-  static const double C12 = 0.831469612303;  // cos(pi * 12 / 64)
-  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
-  static const double C14 = 0.773010453363;  // cos(pi * 14 / 64)
-  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
-  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
-  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
-  static const double C18 = 0.634393284164;  // cos(pi * 18 / 64)
-  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
-  static const double C20 = 0.555570233020;  // cos(pi * 20 / 64)
-  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
-  static const double C22 = 0.471396736826;  // cos(pi * 22 / 64)
-  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
-  static const double C24 = 0.382683432365;  // cos(pi * 24 / 64)
-  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
-  static const double C26 = 0.290284677254;  // cos(pi * 26 / 64)
-  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
-  static const double C28 = 0.195090322016;  // cos(pi * 28 / 64)
-  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
-  static const double C30 = 0.098017140330;  // cos(pi * 30 / 64)
-  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
-
-  double step[32];
-
-  // Stage 1
-  step[0] = input[stride*0] + input[stride*(32 - 1)];
-  step[1] = input[stride*1] + input[stride*(32 - 2)];
-  step[2] = input[stride*2] + input[stride*(32 - 3)];
-  step[3] = input[stride*3] + input[stride*(32 - 4)];
-  step[4] = input[stride*4] + input[stride*(32 - 5)];
-  step[5] = input[stride*5] + input[stride*(32 - 6)];
-  step[6] = input[stride*6] + input[stride*(32 - 7)];
-  step[7] = input[stride*7] + input[stride*(32 - 8)];
-  step[8] = input[stride*8] + input[stride*(32 - 9)];
-  step[9] = input[stride*9] + input[stride*(32 - 10)];
-  step[10] = input[stride*10] + input[stride*(32 - 11)];
-  step[11] = input[stride*11] + input[stride*(32 - 12)];
-  step[12] = input[stride*12] + input[stride*(32 - 13)];
-  step[13] = input[stride*13] + input[stride*(32 - 14)];
-  step[14] = input[stride*14] + input[stride*(32 - 15)];
-  step[15] = input[stride*15] + input[stride*(32 - 16)];
-  step[16] = -input[stride*16] + input[stride*(32 - 17)];
-  step[17] = -input[stride*17] + input[stride*(32 - 18)];
-  step[18] = -input[stride*18] + input[stride*(32 - 19)];
-  step[19] = -input[stride*19] + input[stride*(32 - 20)];
-  step[20] = -input[stride*20] + input[stride*(32 - 21)];
-  step[21] = -input[stride*21] + input[stride*(32 - 22)];
-  step[22] = -input[stride*22] + input[stride*(32 - 23)];
-  step[23] = -input[stride*23] + input[stride*(32 - 24)];
-  step[24] = -input[stride*24] + input[stride*(32 - 25)];
-  step[25] = -input[stride*25] + input[stride*(32 - 26)];
-  step[26] = -input[stride*26] + input[stride*(32 - 27)];
-  step[27] = -input[stride*27] + input[stride*(32 - 28)];
-  step[28] = -input[stride*28] + input[stride*(32 - 29)];
-  step[29] = -input[stride*29] + input[stride*(32 - 30)];
-  step[30] = -input[stride*30] + input[stride*(32 - 31)];
-  step[31] = -input[stride*31] + input[stride*(32 - 32)];
-
-  // Stage 2
-  output[stride*0] = step[0] + step[16 - 1];
-  output[stride*1] = step[1] + step[16 - 2];
-  output[stride*2] = step[2] + step[16 - 3];
-  output[stride*3] = step[3] + step[16 - 4];
-  output[stride*4] = step[4] + step[16 - 5];
-  output[stride*5] = step[5] + step[16 - 6];
-  output[stride*6] = step[6] + step[16 - 7];
-  output[stride*7] = step[7] + step[16 - 8];
-  output[stride*8] = -step[8] + step[16 - 9];
-  output[stride*9] = -step[9] + step[16 - 10];
-  output[stride*10] = -step[10] + step[16 - 11];
-  output[stride*11] = -step[11] + step[16 - 12];
-  output[stride*12] = -step[12] + step[16 - 13];
-  output[stride*13] = -step[13] + step[16 - 14];
-  output[stride*14] = -step[14] + step[16 - 15];
-  output[stride*15] = -step[15] + step[16 - 16];
-
-  output[stride*16] = step[16];
-  output[stride*17] = step[17];
-  output[stride*18] = step[18];
-  output[stride*19] = step[19];
-
-  output[stride*20] = (-step[20] + step[27])*C16;
-  output[stride*21] = (-step[21] + step[26])*C16;
-  output[stride*22] = (-step[22] + step[25])*C16;
-  output[stride*23] = (-step[23] + step[24])*C16;
-
-  output[stride*24] = (step[24] + step[23])*C16;
-  output[stride*25] = (step[25] + step[22])*C16;
-  output[stride*26] = (step[26] + step[21])*C16;
-  output[stride*27] = (step[27] + step[20])*C16;
-
-  output[stride*28] = step[28];
-  output[stride*29] = step[29];
-  output[stride*30] = step[30];
-  output[stride*31] = step[31];
-
-  // Stage 3
-  step[0] = output[stride*0] + output[stride*(8 - 1)];
-  step[1] = output[stride*1] + output[stride*(8 - 2)];
-  step[2] = output[stride*2] + output[stride*(8 - 3)];
-  step[3] = output[stride*3] + output[stride*(8 - 4)];
-  step[4] = -output[stride*4] + output[stride*(8 - 5)];
-  step[5] = -output[stride*5] + output[stride*(8 - 6)];
-  step[6] = -output[stride*6] + output[stride*(8 - 7)];
-  step[7] = -output[stride*7] + output[stride*(8 - 8)];
-  step[8] = output[stride*8];
-  step[9] = output[stride*9];
-  step[10] = (-output[stride*10] + output[stride*13])*C16;
-  step[11] = (-output[stride*11] + output[stride*12])*C16;
-  step[12] = (output[stride*12] + output[stride*11])*C16;
-  step[13] = (output[stride*13] + output[stride*10])*C16;
-  step[14] = output[stride*14];
-  step[15] = output[stride*15];
-
-  step[16] = output[stride*16] + output[stride*23];
-  step[17] = output[stride*17] + output[stride*22];
-  step[18] = output[stride*18] + output[stride*21];
-  step[19] = output[stride*19] + output[stride*20];
-  step[20] = -output[stride*20] + output[stride*19];
-  step[21] = -output[stride*21] + output[stride*18];
-  step[22] = -output[stride*22] + output[stride*17];
-  step[23] = -output[stride*23] + output[stride*16];
-  step[24] = -output[stride*24] + output[stride*31];
-  step[25] = -output[stride*25] + output[stride*30];
-  step[26] = -output[stride*26] + output[stride*29];
-  step[27] = -output[stride*27] + output[stride*28];
-  step[28] = output[stride*28] + output[stride*27];
-  step[29] = output[stride*29] + output[stride*26];
-  step[30] = output[stride*30] + output[stride*25];
-  step[31] = output[stride*31] + output[stride*24];
-
-  // Stage 4
-  output[stride*0] = step[0] + step[3];
-  output[stride*1] = step[1] + step[2];
-  output[stride*2] = -step[2] + step[1];
-  output[stride*3] = -step[3] + step[0];
-  output[stride*4] = step[4];
-  output[stride*5] = (-step[5] + step[6])*C16;
-  output[stride*6] = (step[6] + step[5])*C16;
-  output[stride*7] = step[7];
-  output[stride*8] = step[8] + step[11];
-  output[stride*9] = step[9] + step[10];
-  output[stride*10] = -step[10] + step[9];
-  output[stride*11] = -step[11] + step[8];
-  output[stride*12] = -step[12] + step[15];
-  output[stride*13] = -step[13] + step[14];
-  output[stride*14] = step[14] + step[13];
-  output[stride*15] = step[15] + step[12];
-
-  output[stride*16] = step[16];
-  output[stride*17] = step[17];
-  output[stride*18] = step[18]*-C8 + step[29]*C24;
-  output[stride*19] = step[19]*-C8 + step[28]*C24;
-  output[stride*20] = step[20]*-C24 + step[27]*-C8;
-  output[stride*21] = step[21]*-C24 + step[26]*-C8;
-  output[stride*22] = step[22];
-  output[stride*23] = step[23];
-  output[stride*24] = step[24];
-  output[stride*25] = step[25];
-  output[stride*26] = step[26]*C24 + step[21]*-C8;
-  output[stride*27] = step[27]*C24 + step[20]*-C8;
-  output[stride*28] = step[28]*C8 + step[19]*C24;
-  output[stride*29] = step[29]*C8 + step[18]*C24;
-  output[stride*30] = step[30];
-  output[stride*31] = step[31];
-
-  // Stage 5
-  step[0] = (output[stride*0] + output[stride*1]) * C16;
-  step[1] = (-output[stride*1] + output[stride*0]) * C16;
-  step[2] = output[stride*2]*C24 + output[stride*3] * C8;
-  step[3] = output[stride*3]*C24 - output[stride*2] * C8;
-  step[4] = output[stride*4] + output[stride*5];
-  step[5] = -output[stride*5] + output[stride*4];
-  step[6] = -output[stride*6] + output[stride*7];
-  step[7] = output[stride*7] + output[stride*6];
-  step[8] = output[stride*8];
-  step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
-  step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
-  step[11] = output[stride*11];
-  step[12] = output[stride*12];
-  step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
-  step[14] = output[stride*14]*C8 + output[stride*9]*C24;
-  step[15] = output[stride*15];
-
-  step[16] = output[stride*16] + output[stride*19];
-  step[17] = output[stride*17] + output[stride*18];
-  step[18] = -output[stride*18] + output[stride*17];
-  step[19] = -output[stride*19] + output[stride*16];
-  step[20] = -output[stride*20] + output[stride*23];
-  step[21] = -output[stride*21] + output[stride*22];
-  step[22] = output[stride*22] + output[stride*21];
-  step[23] = output[stride*23] + output[stride*20];
-  step[24] = output[stride*24] + output[stride*27];
-  step[25] = output[stride*25] + output[stride*26];
-  step[26] = -output[stride*26] + output[stride*25];
-  step[27] = -output[stride*27] + output[stride*24];
-  step[28] = -output[stride*28] + output[stride*31];
-  step[29] = -output[stride*29] + output[stride*30];
-  step[30] = output[stride*30] + output[stride*29];
-  step[31] = output[stride*31] + output[stride*28];
-
-  // Stage 6
-  output[stride*0] = step[0];
-  output[stride*1] = step[1];
-  output[stride*2] = step[2];
-  output[stride*3] = step[3];
-  output[stride*4] = step[4]*C28 + step[7]*C4;
-  output[stride*5] = step[5]*C12 + step[6]*C20;
-  output[stride*6] = step[6]*C12 + step[5]*-C20;
-  output[stride*7] = step[7]*C28 + step[4]*-C4;
-  output[stride*8] = step[8] + step[9];
-  output[stride*9] = -step[9] + step[8];
-  output[stride*10] = -step[10] + step[11];
-  output[stride*11] = step[11] + step[10];
-  output[stride*12] = step[12] + step[13];
-  output[stride*13] = -step[13] + step[12];
-  output[stride*14] = -step[14] + step[15];
-  output[stride*15] = step[15] + step[14];
-
-  output[stride*16] = step[16];
-  output[stride*17] = step[17]*-C4 + step[30]*C28;
-  output[stride*18] = step[18]*-C28 + step[29]*-C4;
-  output[stride*19] = step[19];
-  output[stride*20] = step[20];
-  output[stride*21] = step[21]*-C20 + step[26]*C12;
-  output[stride*22] = step[22]*-C12 + step[25]*-C20;
-  output[stride*23] = step[23];
-  output[stride*24] = step[24];
-  output[stride*25] = step[25]*C12 + step[22]*-C20;
-  output[stride*26] = step[26]*C20 + step[21]*C12;
-  output[stride*27] = step[27];
-  output[stride*28] = step[28];
-  output[stride*29] = step[29]*C28 + step[18]*-C4;
-  output[stride*30] = step[30]*C4 + step[17]*C28;
-  output[stride*31] = step[31];
-
-  // Stage 7
-  step[0] = output[stride*0];
-  step[1] = output[stride*1];
-  step[2] = output[stride*2];
-  step[3] = output[stride*3];
-  step[4] = output[stride*4];
-  step[5] = output[stride*5];
-  step[6] = output[stride*6];
-  step[7] = output[stride*7];
-  step[8] = output[stride*8]*C30 + output[stride*15]*C2;
-  step[9] = output[stride*9]*C14 + output[stride*14]*C18;
-  step[10] = output[stride*10]*C22 + output[stride*13]*C10;
-  step[11] = output[stride*11]*C6 + output[stride*12]*C26;
-  step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
-  step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
-  step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
-  step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
-
-  step[16] = output[stride*16] + output[stride*17];
-  step[17] = -output[stride*17] + output[stride*16];
-  step[18] = -output[stride*18] + output[stride*19];
-  step[19] = output[stride*19] + output[stride*18];
-  step[20] = output[stride*20] + output[stride*21];
-  step[21] = -output[stride*21] + output[stride*20];
-  step[22] = -output[stride*22] + output[stride*23];
-  step[23] = output[stride*23] + output[stride*22];
-  step[24] = output[stride*24] + output[stride*25];
-  step[25] = -output[stride*25] + output[stride*24];
-  step[26] = -output[stride*26] + output[stride*27];
-  step[27] = output[stride*27] + output[stride*26];
-  step[28] = output[stride*28] + output[stride*29];
-  step[29] = -output[stride*29] + output[stride*28];
-  step[30] = -output[stride*30] + output[stride*31];
-  step[31] = output[stride*31] + output[stride*30];
-
-  // Final stage --- outputs indices are bit-reversed.
-  output[stride*0] = step[0];
-  output[stride*16] = step[1];
-  output[stride*8] = step[2];
-  output[stride*24] = step[3];
-  output[stride*4] = step[4];
-  output[stride*20] = step[5];
-  output[stride*12] = step[6];
-  output[stride*28] = step[7];
-  output[stride*2] = step[8];
-  output[stride*18] = step[9];
-  output[stride*10] = step[10];
-  output[stride*26] = step[11];
-  output[stride*6] = step[12];
-  output[stride*22] = step[13];
-  output[stride*14] = step[14];
-  output[stride*30] = step[15];
-
-  output[stride*1] = step[16]*C31 + step[31]*C1;
-  output[stride*17] = step[17]*C15 + step[30]*C17;
-  output[stride*9] = step[18]*C23 + step[29]*C9;
-  output[stride*25] = step[19]*C7 + step[28]*C25;
-  output[stride*5] = step[20]*C27 + step[27]*C5;
-  output[stride*21] = step[21]*C11 + step[26]*C21;
-  output[stride*13] = step[22]*C19 + step[25]*C13;
-  output[stride*29] = step[23]*C3 + step[24]*C29;
-  output[stride*3] = step[24]*C3 + step[23]*-C29;
-  output[stride*19] = step[25]*C19 + step[22]*-C13;
-  output[stride*11] = step[26]*C11 + step[21]*-C21;
-  output[stride*27] = step[27]*C27 + step[20]*-C5;
-  output[stride*7] = step[28]*C7 + step[19]*-C25;
-  output[stride*23] = step[29]*C23 + step[18]*-C9;
-  output[stride*15] = step[30]*C15 + step[17]*-C17;
-  output[stride*31] = step[31]*C31 + step[16]*-C1;
-}
-
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-  {
-    int shortpitch = pitch >> 1;
-    int i, j;
-    double output[1024];
-    // First transform columns
-    for (i = 0; i < 32; i++) {
-      double temp_in[32], temp_out[32];
-      for (j = 0; j < 32; j++)
-        temp_in[j] = input[j*shortpitch + i];
-      dct32_1d(temp_in, temp_out, 1);
-      for (j = 0; j < 32; j++)
-        output[j*32 + i] = temp_out[j];
-    }
-    // Then transform rows
-    for (i = 0; i < 32; ++i) {
-      double temp_in[32], temp_out[32];
-      for (j = 0; j < 32; ++j)
-        temp_in[j] = output[j + i*32];
-      dct32_1d(temp_in, temp_out, 1);
-      for (j = 0; j < 32; ++j)
-        output[j + i*32] = temp_out[j];
-    }
-    // Scale by some magic number
-    for (i = 0; i < 1024; i++) {
-      out[i] = (short)round(output[i]/4);
-    }
-  }
-
-  vp9_clear_system_state();  // Make it simd safe : __asm emms;
-}
-
-#else
-
-#define RIGHT_SHIFT 13
-#define ROUNDING (1 << (RIGHT_SHIFT - 1))
-
-static void dct32_1d(int *input, int *output, int last_shift_bits) {
-  static const int16_t C1 = 8182;    // 2^13
-  static const int16_t C2 = 8153;
-  static const int16_t C3 = 8103;
-  static const int16_t C4 = 8035;
-  static const int16_t C5 = 7946;
-  static const int16_t C6 = 7839;
-  static const int16_t C7 = 7713;
-  static const int16_t C8 = 7568;
-  static const int16_t C9 = 7405;
-  static const int16_t C10 = 7225;
-  static const int16_t C11 = 7027;
-  static const int16_t C12 = 6811;
-  static const int16_t C13 = 6580;
-  static const int16_t C14 = 6333;
-  static const int16_t C15 = 6070;
-  static const int16_t C16 = 5793;
-  static const int16_t C17 = 5501;
-  static const int16_t C18 = 5197;
-  static const int16_t C19 = 4880;
-  static const int16_t C20 = 4551;
-  static const int16_t C21 = 4212;
-  static const int16_t C22 = 3862;
-  static const int16_t C23 = 3503;
-  static const int16_t C24 = 3135;
-  static const int16_t C25 = 2760;
-  static const int16_t C26 = 2378;
-  static const int16_t C27 = 1990;
-  static const int16_t C28 = 1598;
-  static const int16_t C29 = 1202;
-  static const int16_t C30 = 803;
-  static const int16_t C31 = 402;
 
+static void dct32_1d(int *input, int *output) {
   int step[32];
-
-  int last_rounding = 0;
-  int final_shift = RIGHT_SHIFT;
-  int final_rounding = 0;
-
-  if (last_shift_bits > 0)
-    last_rounding = 1 << (last_shift_bits - 1);
-
-  final_shift += last_shift_bits;
-  if (final_shift > 0)
-    final_rounding = 1 << (final_shift - 1);
-
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
   step[1] = input[1] + input[(32 - 2)];
@@ -1207,15 +801,15 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   output[18] = step[18];
   output[19] = step[19];
 
-  output[20] = ((-step[20] + step[27]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[21] = ((-step[21] + step[26]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[22] = ((-step[22] + step[25]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[23] = ((-step[23] + step[24]) * C16 + ROUNDING) >> RIGHT_SHIFT;
+  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
+  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
+  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
+  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
 
-  output[24] = ((step[24] + step[23]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[25] = ((step[25] + step[22]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[26] = ((step[26] + step[21]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[27] = ((step[27] + step[20]) * C16 + ROUNDING) >> RIGHT_SHIFT;
+  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
+  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
+  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
+  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
 
   output[28] = step[28];
   output[29] = step[29];
@@ -1233,10 +827,10 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   step[7] = -output[7] + output[(8 - 8)];
   step[8] = output[8];
   step[9] = output[9];
-  step[10] = ((-output[10] + output[13]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  step[11] = ((-output[11] + output[12]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  step[12] = ((output[12] + output[11]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  step[13] = ((output[13] + output[10]) * C16 + ROUNDING) >> RIGHT_SHIFT;
+  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
+  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
+  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
+  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
   step[14] = output[14];
   step[15] = output[15];
 
@@ -1263,8 +857,8 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   output[2] = -step[2] + step[1];
   output[3] = -step[3] + step[0];
   output[4] = step[4];
-  output[5] = ((-step[5] + step[6]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  output[6] = ((step[6] + step[5]) * C16 + ROUNDING) >> RIGHT_SHIFT;
+  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
+  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
   output[7] = step[7];
   output[8] = step[8] + step[11];
   output[9] = step[9] + step[10];
@@ -1277,37 +871,37 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
 
   output[16] = step[16];
   output[17] = step[17];
-  output[18] = (step[18] * -C8 + step[29] * C24 + ROUNDING) >> RIGHT_SHIFT;
-  output[19] = (step[19] * -C8 + step[28] * C24 + ROUNDING) >> RIGHT_SHIFT;
-  output[20] = (step[20] * -C24 + step[27] * -C8 + ROUNDING) >> RIGHT_SHIFT;
-  output[21] = (step[21] * -C24 + step[26] * -C8 + ROUNDING) >> RIGHT_SHIFT;
+  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
+  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
+  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
+  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
   output[22] = step[22];
   output[23] = step[23];
   output[24] = step[24];
   output[25] = step[25];
-  output[26] = (step[26] * C24 + step[21] * -C8 + ROUNDING) >> RIGHT_SHIFT;
-  output[27] = (step[27] * C24 + step[20] * -C8 + ROUNDING) >> RIGHT_SHIFT;
-  output[28] = (step[28] * C8 + step[19] * C24 + ROUNDING) >> RIGHT_SHIFT;
-  output[29] = (step[29] * C8 + step[18] * C24 + ROUNDING) >> RIGHT_SHIFT;
+  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
+  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
+  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
+  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
   output[30] = step[30];
   output[31] = step[31];
 
   // Stage 5
-  step[0] = ((output[0] + output[1]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  step[1] = ((-output[1] + output[0]) * C16 + ROUNDING) >> RIGHT_SHIFT;
-  step[2] = (output[2] * C24 + output[3] * C8 + ROUNDING) >> RIGHT_SHIFT;
-  step[3] = (output[3] * C24 - output[2] * C8 + ROUNDING) >> RIGHT_SHIFT;
+  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
+  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
+  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
+  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
   step[4] = output[4] + output[5];
   step[5] = -output[5] + output[4];
   step[6] = -output[6] + output[7];
   step[7] = output[7] + output[6];
   step[8] = output[8];
-  step[9] = (output[9] * -C8 + output[14] * C24 + ROUNDING) >> RIGHT_SHIFT;
-  step[10] = (output[10] * -C24 + output[13] * -C8 + ROUNDING) >> RIGHT_SHIFT;
+  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
+  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
   step[11] = output[11];
   step[12] = output[12];
-  step[13] = (output[13] * C24 + output[10] * -C8 + ROUNDING) >> RIGHT_SHIFT;
-  step[14] = (output[14] * C8 + output[9] * C24 + ROUNDING) >> RIGHT_SHIFT;
+  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
+  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
   step[15] = output[15];
 
   step[16] = output[16] + output[19];
@@ -1332,10 +926,10 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   output[1] = step[1];
   output[2] = step[2];
   output[3] = step[3];
-  output[4] = (step[4] * C28 + step[7] * C4 + ROUNDING) >> RIGHT_SHIFT;
-  output[5] = (step[5] * C12 + step[6] * C20 + ROUNDING) >> RIGHT_SHIFT;
-  output[6] = (step[6] * C12 + step[5] * -C20 + ROUNDING) >> RIGHT_SHIFT;
-  output[7] = (step[7] * C28 + step[4] * -C4 + ROUNDING) >> RIGHT_SHIFT;
+  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
+  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
+  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
+  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
   output[8] = step[8] + step[9];
   output[9] = -step[9] + step[8];
   output[10] = -step[10] + step[11];
@@ -1346,20 +940,20 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   output[15] = step[15] + step[14];
 
   output[16] = step[16];
-  output[17] = (step[17] * -C4 + step[30] * C28 + ROUNDING) >> RIGHT_SHIFT;
-  output[18] = (step[18] * -C28 + step[29] * -C4 + ROUNDING) >> RIGHT_SHIFT;
+  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
+  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
   output[19] = step[19];
   output[20] = step[20];
-  output[21] = (step[21] * -C20 + step[26] * C12 + ROUNDING) >> RIGHT_SHIFT;
-  output[22] = (step[22] * -C12 + step[25] * -C20 + ROUNDING) >> RIGHT_SHIFT;
+  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
+  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
   output[23] = step[23];
   output[24] = step[24];
-  output[25] = (step[25] * C12 + step[22] * -C20 + ROUNDING) >> RIGHT_SHIFT;
-  output[26] = (step[26] * C20 + step[21] * C12 + ROUNDING) >> RIGHT_SHIFT;
+  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
+  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
   output[27] = step[27];
   output[28] = step[28];
-  output[29] = (step[29] * C28 + step[18] * -C4 + ROUNDING) >> RIGHT_SHIFT;
-  output[30] = (step[30] * C4 + step[17] * C28 + ROUNDING) >> RIGHT_SHIFT;
+  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
+  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
   output[31] = step[31];
 
   // Stage 7
@@ -1371,14 +965,14 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   step[5] = output[5];
   step[6] = output[6];
   step[7] = output[7];
-  step[8] = (output[8] * C30 + output[15] * C2 + ROUNDING) >> RIGHT_SHIFT;
-  step[9] = (output[9] * C14 + output[14] * C18 + ROUNDING) >> RIGHT_SHIFT;
-  step[10] = (output[10] * C22 + output[13] * C10 + ROUNDING) >> RIGHT_SHIFT;
-  step[11] = (output[11] * C6 + output[12] * C26 + ROUNDING) >> RIGHT_SHIFT;
-  step[12] = (output[12] * C6 + output[11] * -C26 + ROUNDING) >> RIGHT_SHIFT;
-  step[13] = (output[13] * C22 + output[10] * -C10 + ROUNDING) >> RIGHT_SHIFT;
-  step[14] = (output[14] * C14 + output[9] * -C18 + ROUNDING) >> RIGHT_SHIFT;
-  step[15] = (output[15] * C30 + output[8] * -C2 + ROUNDING) >> RIGHT_SHIFT;
+  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
+  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
+  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
+  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
+  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
+  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
+  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
+  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
 
   step[16] = output[16] + output[17];
   step[17] = -output[17] + output[16];
@@ -1398,62 +992,40 @@ static void dct32_1d(int *input, int *output, int last_shift_bits) {
   step[31] = output[31] + output[30];
 
   // Final stage --- outputs indices are bit-reversed.
-  output[0] = (step[0] + last_rounding) >> last_shift_bits;
-  output[16] = (step[1] + last_rounding) >> last_shift_bits;
-  output[8] = (step[2] + last_rounding) >> last_shift_bits;
-  output[24] = (step[3] + last_rounding) >> last_shift_bits;
-  output[4] = (step[4] + last_rounding) >> last_shift_bits;
-  output[20] = (step[5] + last_rounding) >> last_shift_bits;
-  output[12] = (step[6] + last_rounding) >> last_shift_bits;
-  output[28] = (step[7] + last_rounding) >> last_shift_bits;
-  output[2] = (step[8] + last_rounding) >> last_shift_bits;
-  output[18] = (step[9] + last_rounding) >> last_shift_bits;
-  output[10] = (step[10] + last_rounding) >> last_shift_bits;
-  output[26] = (step[11] + last_rounding) >> last_shift_bits;
-  output[6] = (step[12] + last_rounding) >> last_shift_bits;
-  output[22] = (step[13] + last_rounding) >> last_shift_bits;
-  output[14] = (step[14] + last_rounding) >> last_shift_bits;
-  output[30] = (step[15] + last_rounding) >> last_shift_bits;
-
-  output[1] = (step[16] * C31 + step[31] * C1 + final_rounding) >> final_shift;
-  output[17] = (step[17] * C15 + step[30] * C17 + final_rounding)
-      >> final_shift;
-  output[9] = (step[18] * C23 + step[29] * C9 + final_rounding) >> final_shift;
-  output[25] = (step[19] * C7 + step[28] * C25 + final_rounding) >> final_shift;
-  output[5] = (step[20] * C27 + step[27] * C5 + final_rounding) >> final_shift;
-  output[21] = (step[21] * C11 + step[26] * C21 + final_rounding)
-      >> final_shift;
-  output[13] = (step[22] * C19 + step[25] * C13 + final_rounding)
-      >> final_shift;
-  output[29] = (step[23] * C3 + step[24] * C29 + final_rounding) >> final_shift;
-  output[3] = (step[24] * C3 + step[23] * -C29 + final_rounding) >> final_shift;
-  output[19] = (step[25] * C19 + step[22] * -C13 + final_rounding)
-      >> final_shift;
-  output[11] = (step[26] * C11 + step[21] * -C21 + final_rounding)
-      >> final_shift;
-  output[27] = (step[27] * C27 + step[20] * -C5 + final_rounding)
-      >> final_shift;
-  output[7] = (step[28] * C7 + step[19] * -C25 + final_rounding) >> final_shift;
-  output[23] = (step[29] * C23 + step[18] * -C9 + final_rounding)
-      >> final_shift;
-  output[15] = (step[30] * C15 + step[17] * -C17 + final_rounding)
-      >> final_shift;
-  output[31] = (step[31] * C31 + step[16] * -C1 + final_rounding)
-      >> final_shift;
-
-  // Clamp to fit 16-bit.
-  if (last_shift_bits > 0) {
-    int i;
-
-    for (i = 0; i < 32; i++)
-      if (output[i] < -32768)
-        output[i] = -32768;
-      else if (output[i] > 32767)
-        output[i] = 32767;
-  }
+  output[0]  = step[0];
+  output[16] = step[1];
+  output[8]  = step[2];
+  output[24] = step[3];
+  output[4]  = step[4];
+  output[20] = step[5];
+  output[12] = step[6];
+  output[28] = step[7];
+  output[2]  = step[8];
+  output[18] = step[9];
+  output[10] = step[10];
+  output[26] = step[11];
+  output[6]  = step[12];
+  output[22] = step[13];
+  output[14] = step[14];
+  output[30] = step[15];
+
+  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
+  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
+  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
+  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
+  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
+  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
+  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
+  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
+  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
+  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
+  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
+  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
+  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
+  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
+  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
+  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
-#undef RIGHT_SHIFT
-#undef ROUNDING
 
 void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
   int shortpitch = pitch >> 1;
@@ -1463,10 +1035,10 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
   for (i = 0; i < 32; i++) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; j++)
-      temp_in[j] = input[j * shortpitch + i];
-    dct32_1d(temp_in, temp_out, 0);
+      temp_in[j] = input[j * shortpitch + i] << 2;
+    dct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; j++)
-      output[j * 32 + i] = temp_out[j];
+      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Then transform rows
@@ -1474,10 +1046,9 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 2);
+    dct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      out[j + i * 32] = temp_out[j];
+      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 }
 
-#endif
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 880555797..c0fe5ac76 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -654,7 +654,7 @@ static void set_offsets(VP9_COMP *cpi,
   // Set up destination pointers
   setup_pred_block(&xd->dst,
                    &cm->yv12_fb[dst_fb_idx],
-                   mb_row, mb_col);
+                   mb_row, mb_col, NULL, NULL);
 
   /* Set up limit values for MV components to prevent them from
    * extending beyond the UMV borders assuming 16x16 block size */
@@ -679,7 +679,7 @@ static void set_offsets(VP9_COMP *cpi,
   xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
 
   /* set up source buffers */
-  setup_pred_block(&x->src, cpi->Source, mb_row, mb_col);
+  setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
 
   /* R/D setup */
   x->rddiv = cpi->RDDIV;
@@ -1187,7 +1187,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   // Copy data over into macro block data structures.
   x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]];
+  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
   xd->dst = cm->yv12_fb[cm->new_fb_idx];
 
   // set up frame for intra coded blocks
@@ -1272,9 +1272,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   totalrate = 0;
 
-  // Functions setup for all frame types so we can use MC in AltRef
-  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);
-
   // Reset frame count of inter 0,0 motion vector usage.
   cpi->inter_zz_count = 0;
 
@@ -2092,55 +2089,50 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
     setup_pred_block(&xd->pre,
                      &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col);
+                     mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     if (mbmi->second_ref_frame > 0) {
       int second_ref_fb_idx;
 
       if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
       setup_pred_block(&xd->second_pre,
                        &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col);
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
     if (!x->skip) {
-      vp9_encode_inter16x16(x);
+      vp9_encode_inter16x16(x, mb_row, mb_col);
 
       // Clear mb_skip_coeff if mb_no_coeff_skip is not set
       if (!cpi->common.mb_no_coeff_skip)
         mbmi->mb_skip_coeff = 0;
 
     } else {
-      vp9_build_1st_inter16x16_predictors_mb(xd,
-                                             xd->dst.y_buffer,
-                                             xd->dst.u_buffer,
-                                             xd->dst.v_buffer,
-                                             xd->dst.y_stride,
-                                             xd->dst.uv_stride);
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        vp9_build_2nd_inter16x16_predictors_mb(xd,
-                                               xd->dst.y_buffer,
-                                               xd->dst.u_buffer,
-                                               xd->dst.v_buffer,
-                                               xd->dst.y_stride,
-                                               xd->dst.uv_stride);
-      }
+      vp9_build_inter16x16_predictors_mb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
 #if CONFIG_COMP_INTERINTRA_PRED
-      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
         vp9_build_interintra_16x16_predictors_mb(xd,
                                                  xd->dst.y_buffer,
                                                  xd->dst.u_buffer,
@@ -2327,34 +2319,37 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
     setup_pred_block(&xd->pre,
                      &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col);
+                     mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
       int second_ref_fb_idx;
 
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
       setup_pred_block(&xd->second_pre,
                        &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col);
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
     vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
@@ -2553,34 +2548,37 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
     setup_pred_block(&xd->pre,
                      &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col);
+                     mb_row, mb_col,
+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
 
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
       int second_ref_fb_idx;
 
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
       setup_pred_block(&xd->second_pre,
                        &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col);
+                       mb_row, mb_col,
+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
     }
 
     vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
                                        xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride);
+                                       xd->dst.y_stride, xd->dst.uv_stride,
+                                       mb_row, mb_col);
   }
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index ef64db1db..43bb4640c 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -54,9 +54,9 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
 
   tx_type = get_tx_type_4x4(&x->e_mbd, b);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
     vp9_ht_quantize_b_4x4(be, b, tx_type);
-    vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);
+    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
   } else {
     x->fwd_txm4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(be, b) ;
@@ -149,10 +149,10 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
 
     tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 32, tx_type);
+      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
       vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
-                            32, tx_type);
+                            16, tx_type);
     } else {
       x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
@@ -164,9 +164,9 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
       be = &x->block[ib + iblock[i]];
       tx_type = get_tx_type_4x4(xd, b);
       if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
         vp9_ht_quantize_b_4x4(be, b, tx_type);
-        vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);
+        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
       } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
         x->fwd_txm8x4(be->src_diff, be->coeff, 32);
         x->quantize_b_4x4_pair(be, be + 1, b, b + 1);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 9ff5dd96a..ee08d263c 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -174,7 +174,7 @@ void vp9_transform_mby_4x4(MACROBLOCK *x) {
     BLOCK *b = &x->block[i];
     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(b->src_diff, b->coeff, 32, tx_type);
+      vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);
     } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) {
       x->fwd_txm8x4(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
@@ -209,7 +209,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
     BLOCK *b = &x->block[i];
     tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, b->coeff, 32, tx_type);
+      vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);
     } else {
       x->fwd_txm8x8(&x->block[i].src_diff[0],
                            &x->block[i].coeff[0], 32);
@@ -219,7 +219,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) {
     BLOCK *b = &x->block[i];
     tx_type = get_tx_type_8x8(xd, &xd->block[i]);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 32, tx_type);
+      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);
     } else {
       x->fwd_txm8x8(&x->block[i].src_diff[0],
                            &x->block[i + 2].coeff[0], 32);
@@ -247,7 +247,7 @@ void vp9_transform_mby_16x16(MACROBLOCK *x) {
   TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
   vp9_clear_system_state();
   if (tx_type != DCT_DCT) {
-    vp9_short_fht16x16(b->src_diff, b->coeff, 32, tx_type);
+    vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);
   } else {
     x->fwd_txm16x16(&x->block[0].src_diff[0],
                            &x->block[0].coeff[0], 32);
@@ -597,13 +597,8 @@ void vp9_optimize_mby_8x8(MACROBLOCK *x) {
   for (b = 0; b < 16; b += 4) {
     ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-#if CONFIG_CNVCONTEXT
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-#else
-    ENTROPY_CONTEXT above_ec = a[0];
-    ENTROPY_CONTEXT left_ec = l[0];
-#endif
     optimize_b(x, b, PLANE_TYPE_Y_WITH_DC, &above_ec, &left_ec, TX_8X8);
     a[1] = a[0] = above_ec;
     l[1] = l[0] = left_ec;
@@ -621,13 +616,8 @@ void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
   for (b = 16; b < 24; b += 4) {
     ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-#if CONFIG_CNVCONTEXT
     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-#else
-    ENTROPY_CONTEXT above_ec = a[0];
-    ENTROPY_CONTEXT left_ec = l[0];
-#endif
     optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8);
   }
 }
@@ -645,13 +635,8 @@ void vp9_optimize_mby_16x16(MACROBLOCK *x) {
   if (!t_above || !t_left)
     return;
 
-#if CONFIG_CNVCONTEXT
   ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
   tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-#else
-  ta = t_above->y1[0];
-  tl = t_left->y1[0];
-#endif
   optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16);
 }
 
@@ -699,21 +684,21 @@ void vp9_fidct_mb(MACROBLOCK *x) {
   }
 }
 
-void vp9_encode_inter16x16(MACROBLOCK *x) {
+void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  vp9_build_inter_predictors_mb(xd);
+  vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   subtract_mb(x);
   vp9_fidct_mb(x);
   vp9_recon_mb(xd);
 }
 
 /* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x) {
+void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
 
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
 
   vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
 
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index f3c679227..6356df215 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -23,14 +23,14 @@ typedef struct {
 
 #include "vp9/encoder/vp9_onyx_int.h"
 struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(MACROBLOCK *x);
+void vp9_encode_inter16x16(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_mbuv_4x4(MACROBLOCK *x);
 void vp9_transform_mby_4x4(MACROBLOCK *x);
 
 void vp9_optimize_mby_4x4(MACROBLOCK *x);
 void vp9_optimize_mbuv_4x4(MACROBLOCK *x);
-void vp9_encode_inter16x16y(MACROBLOCK *x);
+void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
 
 void vp9_transform_mb_8x8(MACROBLOCK *mb);
 void vp9_transform_mby_8x8(MACROBLOCK *x);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a4924874d..4d0a299e8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -436,10 +436,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   int recon_yoffset, recon_uvoffset;
   YV12_BUFFER_CONFIG *lst_yv12 =
-      &cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]];
+      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
   YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
   YV12_BUFFER_CONFIG *gld_yv12 =
-      &cm->yv12_fb[cm->active_ref_idx[cpi->gld_fb_idx]];
+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
   int recon_y_stride = lst_yv12->y_stride;
   int recon_uv_stride = lst_yv12->uv_stride;
   int64_t intra_error = 0;
@@ -613,7 +613,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x);
+          vp9_encode_inter16x16y(x, mb_row, mb_col);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -1663,8 +1663,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   // Clip cpi->twopass.gf_group_bits based on user supplied data rate
   // variability limit (cpi->oxcf.two_pass_vbrmax_section)
-  if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
-    cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+  if (cpi->twopass.gf_group_bits >
+      (int64_t)max_bits * cpi->baseline_gf_interval)
+    cpi->twopass.gf_group_bits = (int64_t)max_bits * cpi->baseline_gf_interval;
 
   // Reset the file position
   reset_fpf_position(cpi, start_pos);
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 218a47a8e..d6644c2aa 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -20,7 +20,9 @@
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
                                               int_mv *ref_mv,
-                                              int_mv *dst_mv) {
+                                              int_mv *dst_mv,
+                                              int mb_row,
+                                              int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   BLOCK *b  = &x->block[0];
@@ -72,7 +74,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   }
 
   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
+  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
   best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
                           xd->predictor, 16, INT_MAX);
 
@@ -93,8 +95,9 @@ static int do_16x16_motion_search
   YV12_BUFFER_CONFIG *buf,
   int buf_mb_y_offset,
   YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
+  int mb_y_offset,
+  int mb_row,
+  int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
@@ -124,7 +127,7 @@ static int do_16x16_motion_search
 
   // Test last reference frame using the previous best mv as the
   // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
   if (tmp_err < err) {
     err            = tmp_err;
     dst_mv->as_int = tmp_mv.as_int;
@@ -136,7 +139,8 @@ static int do_16x16_motion_search
     int_mv zero_ref_mv, tmp_mv;
 
     zero_ref_mv.as_int = 0;
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,
+                                        mb_row, mb_col);
     if (tmp_err < err) {
       dst_mv->as_int = tmp_mv.as_int;
       err = tmp_err;
@@ -229,7 +233,9 @@ static void update_mbgraph_mb_stats
   int gld_y_offset,
   YV12_BUFFER_CONFIG *alt_ref,
   int_mv *prev_alt_ref_mv,
-  int arf_y_offset
+  int arf_y_offset,
+  int mb_row,
+  int mb_col
 ) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -249,7 +255,8 @@ static void update_mbgraph_mb_stats
     int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
                                                 &stats->ref[GOLDEN_FRAME].m.mv,
                                                 buf, mb_y_offset,
-                                                golden_ref, gld_y_offset);
+                                                golden_ref, gld_y_offset,
+                                                mb_row, mb_col);
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
     stats->ref[GOLDEN_FRAME].err = INT_MAX;
@@ -292,6 +299,9 @@ static void update_mbgraph_frame_stats
   int_mv arf_top_mv, gld_top_mv;
   MODE_INFO mi_local;
 
+  // Make sure the mi context starts in a consistent state.
+  memset(&mi_local, 0, sizeof(mi_local));
+
   // Set up limit values for motion vectors to prevent them extending outside the UMV borders
   arf_top_mv.as_int = 0;
   gld_top_mv.as_int = 0;
@@ -323,7 +333,8 @@ static void update_mbgraph_frame_stats
 
       update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
                               golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset);
+                              alt_ref,    &arf_left_mv, arf_y_in_offset,
+                              mb_row, mb_col);
       arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
       gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
       if (mb_col == 0) {
@@ -434,7 +445,7 @@ void vp9_update_mbgraph_stats
   VP9_COMMON *const cm = &cpi->common;
   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
   YV12_BUFFER_CONFIG *golden_ref =
-      &cm->yv12_fb[cm->active_ref_idx[cpi->gld_fb_idx]];
+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
 
   // we need to look ahead beyond where the ARF transitions into
   // being a GF - so exit if we don't look ahead beyond that
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index feb1e36c0..ced6eddca 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -10,6 +10,7 @@
 
 
 #include "vpx_config.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -832,7 +833,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   }
 
   {
-    int y_stride = cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].y_stride;
+    int y_stride = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].y_stride;
 
     if (cpi->sf.search_method == NSTEP) {
       vp9_init3smotion_compensation(&cpi->mb, y_stride);
@@ -1753,7 +1754,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 #endif
       if (cpi->b_calculate_psnr) {
         YV12_BUFFER_CONFIG *lst_yv12 =
-            &cpi->common.yv12_fb[cpi->common.active_ref_idx[cpi->lst_fb_idx]];
+            &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];
         double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
         double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
         double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
@@ -2098,11 +2099,11 @@ int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int ref_fb_idx;
 
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->lst_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->gld_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->alt_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
   else
     return -1;
 
@@ -2119,11 +2120,11 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int ref_fb_idx;
 
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->lst_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->gld_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->alt_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
   else
     return -1;
 
@@ -2198,6 +2199,69 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
 }
 #endif
 
+static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
+                                   YV12_BUFFER_CONFIG *dst_fb) {
+  const int in_w = src_fb->y_width;
+  const int in_h = src_fb->y_height;
+  const int out_w = dst_fb->y_width;
+  const int out_h = dst_fb->y_height;
+  int x, y;
+  // Build dst_fb from src_fb with vp9_convolve8, in 16-pixel luma steps.
+  for (y = 0; y < out_h; y += 16) {
+    for (x = 0; x < out_w; x += 16) {
+      int x_q4 = x * 16 * in_w / out_w;  // dst x mapped to src, times 16
+      int y_q4 = y * 16 * in_h / out_h;  // dst y mapped to src, times 16
+      uint8_t *src, *dst;
+      int src_stride, dst_stride;
+      // NOTE(review): tiles are not clipped to out_w/out_h; confirm dst padding.
+      // Luma: integer src position; low 4 bits of x_q4/y_q4 pick the filters.
+      src = src_fb->y_buffer +
+          y * in_h / out_h * src_fb->y_stride +
+          x * in_w / out_w;
+      dst = dst_fb->y_buffer +
+          y * dst_fb->y_stride +
+          x;
+      src_stride = src_fb->y_stride;
+      dst_stride = dst_fb->y_stride;
+
+      vp9_convolve8(src, src_stride, dst, dst_stride,
+                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                    16, 16);
+
+      x_q4 >>= 1;  // U and V below use half the luma position (x / 2, y / 2)
+      y_q4 >>= 1;
+      src_stride = src_fb->uv_stride;
+      dst_stride = dst_fb->uv_stride;
+
+      src = src_fb->u_buffer +
+          y / 2 * in_h / out_h * src_fb->uv_stride +
+          x / 2 * in_w / out_w;
+      dst = dst_fb->u_buffer +
+          y / 2 * dst_fb->uv_stride +
+          x / 2;
+      vp9_convolve8(src, src_stride, dst, dst_stride,
+                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                    8, 8);
+
+      src = src_fb->v_buffer +
+          y / 2 * in_h / out_h * src_fb->uv_stride +
+          x / 2 * in_w / out_w;
+      dst = dst_fb->v_buffer +
+          y / 2 * dst_fb->uv_stride +
+          x / 2;
+      vp9_convolve8(src, src_stride, dst, dst_stride,
+                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                    8, 8);
+    }
+  }
+
+  vp8_yv12_extend_frame_borders(dst_fb);
+}
+
+
 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
@@ -2416,9 +2480,9 @@ static void update_reference_frames(VP9_COMP * const cpi) {
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->gld_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->alt_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
   } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
     /* Preserve the previously existing golden frame and update the frame in
      * the alt ref slot instead. This is highly specific to the current use of
@@ -2432,7 +2496,7 @@ static void update_reference_frames(VP9_COMP * const cpi) {
     int tmp;
 
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->alt_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
 
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
@@ -2440,18 +2504,18 @@ static void update_reference_frames(VP9_COMP * const cpi) {
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
       ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->active_ref_idx[cpi->alt_fb_idx], cm->new_fb_idx);
+                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
     }
 
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->active_ref_idx[cpi->gld_fb_idx], cm->new_fb_idx);
+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     }
   }
 
   if (cpi->refresh_last_frame) {
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->lst_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
   }
 }
 
@@ -2535,6 +2599,38 @@ static void select_interintra_mode(VP9_COMP *cpi) {
 }
 #endif
 
+static void scale_references(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int i;
+  // For ref_frame_map slots 0..2, store a usable buffer index in scaled_ref_idx.
+  for (i = 0; i < 3; i++) {
+    YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]];
+    // Size mismatch: scale into a new buffer; otherwise reuse and add a ref.
+    if (ref->y_width != cm->Width || ref->y_height != cm->Height) {
+      int new_fb = get_free_fb(cm);
+      // NOTE(review): new_fb is used unchecked; confirm get_free_fb can't fail.
+      vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],
+                                    cm->mb_cols * 16,
+                                    cm->mb_rows * 16,
+                                    VP9BORDERINPIXELS);
+      scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
+      cpi->scaled_ref_idx[i] = new_fb;
+    } else {
+      cpi->scaled_ref_idx[i] = cm->ref_frame_map[i];
+      cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++;  // dropped again on release
+    }
+  }
+}
+
+static void release_scaled_references(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int i;
+  // Drop one reference on each buffer index stored in cpi->scaled_ref_idx.
+  for (i = 0; i < 3; i++) {
+    cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
+  }
+}
+
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
                                       unsigned long *size,
                                       unsigned char *dest,
@@ -2583,6 +2679,17 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   int mcomp_filter_index = 0;
   int64_t mcomp_filter_cost[4];
 
+  /* Scale the source buffer, if required */
+  if (cm->Width != cpi->un_scaled_source->y_width ||
+      cm->Height != cpi->un_scaled_source->y_height) {
+    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+    cpi->Source = &cpi->scaled_source;
+  } else {
+    cpi->Source = cpi->un_scaled_source;
+  }
+
+  scale_references(cpi);
+
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();
 
@@ -3231,6 +3338,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     update_reference_segmentation_map(cpi);
   }
 
+  release_scaled_references(cpi);
   update_reference_frames(cpi);
   vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4);
   vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
@@ -3373,7 +3481,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     if (cpi->twopass.total_left_stats->coded_error != 0.0)
       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+              "%6d %5d %5d %5d %8.2f %10d %10.3f"
               "%10.3f %8d %10d %10d %10d\n",
               cpi->common.current_video_frame, cpi->this_frame_target,
               cpi->projected_frame_size, 0, //loop_size_estimate,
@@ -3400,7 +3508,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     else
       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
-              "%5d %5d %8d %8.2f %10d %10.3f"
+              "%5d %5d %8d %8d %8.2f %10d %10.3f"
               "%8d %10d %10d %10d\n",
               cpi->common.current_video_frame,
               cpi->this_frame_target, cpi->projected_frame_size,
@@ -3516,6 +3624,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   xd->update_mb_segmentation_data = 0;
   xd->mode_ref_lf_delta_update = 0;
 
+  // keep track of the last coded dimensions
+  cm->last_width = cm->Width;
+  cm->last_height = cm->Height;
 
   // Dont increment frame counters if this was an altref buffer update not a real frame
   if (cm->show_frame) {
@@ -3533,8 +3644,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     FILE *recon_file;
     sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
     recon_file = fopen(filename, "wb");
-    fwrite(cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].buffer_alloc,
-           cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].frame_size,
+    fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
+           cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
            1, recon_file);
     fclose(recon_file);
   }
@@ -3756,28 +3867,16 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
   cm->new_fb_idx = get_free_fb(cm);
 
+  /* Get the mapping of L/G/A to the reference buffer pool */
+  cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];
+  cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];
+  cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];
+
   /* Reset the frame pointers to the current frame size */
   vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
                                 cm->mb_cols * 16, cm->mb_rows * 16,
                                 VP9BORDERINPIXELS);
 
-  /* Disable any references that have different size */
-  if ((cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].y_width !=
-       cm->yv12_fb[cm->new_fb_idx].y_width) ||
-      (cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].y_height !=
-       cm->yv12_fb[cm->new_fb_idx].y_height))
-    cpi->ref_frame_flags &= ~VP9_LAST_FLAG;
-  if ((cm->yv12_fb[cm->active_ref_idx[cpi->gld_fb_idx]].y_width !=
-       cm->yv12_fb[cm->new_fb_idx].y_width) ||
-      (cm->yv12_fb[cm->active_ref_idx[cpi->gld_fb_idx]].y_height !=
-       cm->yv12_fb[cm->new_fb_idx].y_height))
-    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-  if ((cm->yv12_fb[cm->active_ref_idx[cpi->alt_fb_idx]].y_width !=
-       cm->yv12_fb[cm->new_fb_idx].y_width) ||
-      (cm->yv12_fb[cm->active_ref_idx[cpi->alt_fb_idx]].y_height !=
-       cm->yv12_fb[cm->new_fb_idx].y_height))
-    cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
   if (cpi->pass == 1) {
     Pass1Encode(cpi, size, dest, frame_flags);
@@ -4027,18 +4126,31 @@ int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
 int vp9_set_internal_size(VP9_PTR comp,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
+  VP9_COMMON *cm = &cpi->common;
 
-  if (horiz_mode <= ONETWO)
-    cpi->horiz_scale = horiz_mode;
-  else
+  if (horiz_mode > ONETWO)  // modes past ONETWO are rejected
     return -1;
 
-  if (vert_mode <= ONETWO)
-    cpi->vert_scale = vert_mode;
-  else
+  if (vert_mode > ONETWO)  // same range check for the vertical mode
     return -1;
 
-  vp9_change_config(comp, &cpi->oxcf);
+  if (cm->horiz_scale != horiz_mode || cm->vert_scale != vert_mode) {
+    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+    // Scale mode changed: store it and derive the new coded frame size.
+    cm->horiz_scale = horiz_mode;
+    cm->vert_scale = vert_mode;
+
+    Scale2Ratio(cm->horiz_scale, &hr, &hs);
+    Scale2Ratio(cm->vert_scale, &vr, &vs);
+    // Coded size is the configured size times hr/hs (vr/vs); adding
+    // (hs - 1) before dividing always goes to the next whole number.
+    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+  }
+  assert(cm->Width <= cpi->initial_width);
+  assert(cm->Height <= cpi->initial_height);
+  update_frame_size(cpi);
   return 0;
 }
 
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9b509ea0b..02a371964 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -332,6 +332,7 @@ typedef struct VP9_COMP {
   int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)
   int gold_is_alt;  // don't do both alt and gold search ( just do gold).
 
+  int scaled_ref_idx[3];
   int lst_fb_idx;
   int gld_fb_idx;
   int alt_fb_idx;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index be091eee2..4f843005a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -402,6 +402,10 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
   ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+  ENTROPY_CONTEXT *const a1 = a +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT *const l1 = l +
+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
 
   switch (tx_size) {
     case TX_4X4:
@@ -416,6 +420,8 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
       }
       break;
     case TX_8X8:
+      a_ec = (a[0] + a[1]) != 0;
+      l_ec = (l[0] + l[1]) != 0;
       scan = vp9_default_zig_zag1d_8x8;
       seg_eob = 64;
       break;
@@ -425,12 +431,21 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
       if (type == PLANE_TYPE_UV) {
         const int uv_idx = ib - 16;
         qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;
+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
       }
       break;
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       seg_eob = 1024;
       qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      a_ec = (a[0] + a[1] + a[2] + a[3] +
+              a1[0] + a1[1] + a1[2] + a1[3]) != 0;
+      l_ec = (l[0] + l[1] + l[2] + l[3] +
+              l1[0] + l1[1] + l1[2] + l1[3]) != 0;
       break;
     default:
       abort();
@@ -459,6 +474,20 @@ static INLINE int cost_coeffs(MACROBLOCK *mb,
   // is eob first coefficient;
   pt = (c > 0);
   *a = *l = pt;
+  if (tx_size >= TX_8X8) {
+    a[1] = l[1] = pt;
+    if (tx_size >= TX_16X16) {
+      if (type == PLANE_TYPE_UV) {
+        a1[0] = a1[1] = l1[0] = l1[1] = pt;
+      } else {
+        a[2] = a[3] = l[2] = l[3] = pt;
+        if (tx_size >= TX_32X32) {
+          a1[0] = a1[1] = a1[2] = a1[3] = pt;
+          l1[0] = l1[1] = l1[2] = l1[3] = pt;
+        }
+      }
+    }
+  }
   return cost;
 }
 
@@ -701,15 +730,15 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
 
 static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
   MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
   if (backup) {
     ta = (ENTROPY_CONTEXT *) &t_above,
     tl = (ENTROPY_CONTEXT *) &t_left;
 
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES) * 2);
   } else {
     ta = (ENTROPY_CONTEXT *) xd->above_context;
     tl = (ENTROPY_CONTEXT *) xd->left_context;
@@ -1013,7 +1042,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
     b->bmi.as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, b);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
       vp9_ht_quantize_b_4x4(be, b, tx_type);
     } else {
       x->fwd_txm4x4(be->src_diff, be->coeff, 32);
@@ -1046,7 +1075,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, b->diff, 32, best_tx_type);
+    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);
   else
     xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
 
@@ -1279,8 +1308,9 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   int distortion = 0, rate = 0;
   BLOCK  *be = x->block + ib;
   BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;
+  ENTROPY_CONTEXT_PLANES ta, tl;
+  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
+  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
 
   /*
    * The predictor buffer is a 2d buffer with a stride of 16.  Create
@@ -1309,7 +1339,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
       TX_TYPE tx_type = get_tx_type_8x8(xd, b);
       if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 32, tx_type);
+        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
       else
         x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
@@ -1317,23 +1347,29 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       // compute quantization mse of 8x8 block
       distortion = vp9_block_error_c((x->block + idx)->coeff,
                                      (xd->block + idx)->dqcoeff, 64);
-      ta0 = a[vp9_block2above[TX_8X8][idx]];
-      tl0 = l[vp9_block2left[TX_8X8][idx]];
+
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
 
       rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                           &ta0, &tl0, TX_8X8);
+                           ta0, tl0, TX_8X8);
 
       rate += rate_t;
-      ta1 = ta0;
-      tl1 = tl0;
     } else {
       static const int iblock[4] = {0, 1, 4, 5};
       TX_TYPE tx_type;
       int i;
-      ta0 = a[vp9_block2above[TX_4X4][ib]];
-      ta1 = a[vp9_block2above[TX_4X4][ib + 1]];
-      tl0 = l[vp9_block2left[TX_4X4][ib]];
-      tl1 = l[vp9_block2left[TX_4X4][ib + 4]];
+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];
+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];
+      ta1 = ta0 + 1;
+      tl1 = tl0 + 1;
       distortion = 0;
       rate_t = 0;
       for (i = 0; i < 4; ++i) {
@@ -1342,7 +1378,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         be = &x->block[ib + iblock[i]];
         tx_type = get_tx_type_4x4(xd, b);
         if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(be->src_diff, be->coeff, 32, tx_type);
+          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
           vp9_ht_quantize_b_4x4(be, b, tx_type);
         } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {
           x->fwd_txm8x4(be->src_diff, be->coeff, 32);
@@ -1354,15 +1390,13 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         }
         distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
         rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
-                              // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                              &ta0, &tl0,
+                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                               TX_4X4);
         if (do_two) {
+          i++;
           rate_t += cost_coeffs(x, b + 1, PLANE_TYPE_Y_WITH_DC,
-                                // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                                &ta0, &tl0,
+                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
                                 TX_4X4);
-          i++;
         }
       }
       b = &xd->block[ib];
@@ -1376,10 +1410,10 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
       *bestrate = rate;
       *bestratey = rate_t;
       *bestdistortion = distortion;
-      besta0 = ta0;
-      besta1 = ta1;
-      bestl0 = tl0;
-      bestl1 = tl1;
+      besta0 = *ta0;
+      besta1 = *ta1;
+      bestl0 = *tl0;
+      bestl1 = *tl1;
       best_rd = this_rd;
       *best_mode = mode;
       copy_predictor_8x8(best_predictor, b->predictor);
@@ -1532,12 +1566,12 @@ static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
   ENTROPY_CONTEXT *ta, *tl;
 
   if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
 
     ta = (ENTROPY_CONTEXT *) &t_above;
     tl = (ENTROPY_CONTEXT *) &t_left;
@@ -1637,8 +1671,9 @@ static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
 }
 
 static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                              int *distortion, int *skip, int fullpixel) {
-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);
+                              int *distortion, int *skip, int fullpixel,
+                              int mb_row, int mb_col) {
+  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     x->e_mbd.predictor, x->src.uv_stride);
   return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);
@@ -2115,9 +2150,22 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x,
       BLOCK *be = &x->block[i];
       int thisdistortion;
 
-      vp9_build_inter_predictors_b(bd, 16, &xd->subpix);
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0)
-        vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix);
+      vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,
+                                bd->pre_stride,
+                                bd->predictor, 16,
+                                &bd->bmi.as_mv[0],
+                                &xd->scale_factor[0],
+                                4, 4, 0 /* no avg */, &xd->subpix);
+
+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+        vp9_build_inter_predictor(*(bd->base_second_pre) + bd->pre,
+                                  bd->pre_stride,
+                                  bd->predictor, 16,
+                                  &bd->bmi.as_mv[1],
+                                  &xd->scale_factor[1],
+                                  4, 4, 1 /* avg */, &xd->subpix);
+      }
+
       vp9_subtract_b(be, bd, 16);
       x->fwd_txm4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(be, bd);
@@ -2159,14 +2207,25 @@ static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,
     int ib = vp9_i8x8_block[i];
 
     if (labels[ib] == which_label) {
+      const int use_second_ref =
+          xd->mode_info_context->mbmi.second_ref_frame > 0;
+      int which_mv;
       int idx = (ib & 8) + ((ib & 2) << 1);
       BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
       BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
       int thisdistortion;
 
-      vp9_build_inter_predictors4b(xd, bd, 16);
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0)
-        vp9_build_2nd_inter_predictors4b(xd, bd, 16);
+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+        uint8_t **base_pre = which_mv ? bd->base_second_pre : bd->base_pre;
+
+        vp9_build_inter_predictor(*base_pre + bd->pre,
+                                  bd->pre_stride,
+                                  bd->predictor, 16,
+                                  &bd->bmi.as_mv[which_mv],
+                                  &xd->scale_factor[which_mv],
+                                  8, 8, which_mv, &xd->subpix);
+      }
+
       vp9_subtract_4b_c(be, bd, 16);
 
       if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
@@ -3050,26 +3109,45 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
                                int frame_mdcounts[4][4],
-                               YV12_BUFFER_CONFIG yv12_mb[4]) {
-  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
+                               YV12_BUFFER_CONFIG yv12_mb[4],
+                               struct scale_factors scale[MAX_REF_FRAMES]) {
+  VP9_COMMON *cm = &cpi->common;
+  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  int use_prev_in_find_mv_refs, use_prev_in_find_best_ref;
 
-  setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col);
+  // set up scaling factors
+  scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
+  scale[frame_type].x_offset_q4 =
+      (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf;
+  scale[frame_type].y_offset_q4 =
+      (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf;
+
+  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+  // use the UV scaling factors.
+  setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col,
+                   &scale[frame_type], &scale[frame_type]);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
+  use_prev_in_find_mv_refs = cm->Width == cm->last_width &&
+                             cm->Height == cm->last_height &&
+                             !cpi->common.error_resilient_mode;
   vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,
-                   cpi->common.error_resilient_mode ?
-                   0 : xd->prev_mode_info_context,
+                   use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL,
                    frame_type,
                    mbmi->ref_mvs[frame_type],
                    cpi->common.ref_frame_sign_bias);
 
   // Candidate refinement carried out at encoder and decoder
+  use_prev_in_find_best_ref =
+      scale[frame_type].x_num == scale[frame_type].x_den &&
+      scale[frame_type].y_num == scale[frame_type].y_den &&
+      !cm->error_resilient_mode &&
+      !cm->frame_parallel_decoding_mode;
   vp9_find_best_ref_mvs(xd,
-                        cpi->common.error_resilient_mode ||
-                        cpi->common.frame_parallel_decoding_mode ?
-                        0 : yv12_mb[frame_type].y_buffer,
+                        use_prev_in_find_best_ref ?
+                            yv12_mb[frame_type].y_buffer : NULL,
                         yv12->y_stride,
                         mbmi->ref_mvs[frame_type],
                         &frame_nearest_mv[frame_type],
@@ -3140,7 +3218,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  int mode_index,
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv frame_mv[MB_MODE_COUNT]
-                                                [MAX_REF_FRAMES]) {
+                                                [MAX_REF_FRAMES],
+                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
+                                 int mb_row, int mb_col) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -3184,6 +3264,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
       } else {
+        YV12_BUFFER_CONFIG backup_yv12 = xd->pre;
         int bestsme = INT_MAX;
         int further_steps, step_param = cpi->sf.first_step;
         int sadpb = x->sadperbit16;
@@ -3195,6 +3276,16 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
+        if (scaled_ref_frame) {
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          xd->pre = *scaled_ref_frame;
+          xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16;
+          xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+          xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+        }
+
         vp9_clamp_mv_min_max(x, &ref_mv[0]);
 
         // mvp_full.as_int = ref_mv[0].as_int;
@@ -3237,6 +3328,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
                                   x->nmvjointcost, x->mvcost,
                                   96, xd->allow_high_precision_mv);
+
+        // restore the predictor, if required
+        if (scaled_ref_frame) {
+          xd->pre = backup_yv12;
+        }
       }
       break;
     case NEARMV:
@@ -3318,7 +3414,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                            xd->dst.u_buffer,
                                            xd->dst.v_buffer,
                                            xd->dst.y_stride,
-                                           xd->dst.uv_stride);
+                                           xd->dst.uv_stride,
+                                           mb_row, mb_col);
         var = vp9_variance64x64(*(b->base_src), b->src_stride,
                                 xd->dst.y_buffer, xd->dst.y_stride, &sse);
         // Note our transform coeffs are 8 times an orthogonal transform.
@@ -3402,7 +3499,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                            xd->dst.u_buffer,
                                            xd->dst.v_buffer,
                                            xd->dst.y_stride,
-                                           xd->dst.uv_stride);
+                                           xd->dst.uv_stride,
+                                           mb_row, mb_col);
         var = vp9_variance32x32(*(b->base_src), b->src_stride,
                                 xd->dst.y_buffer, xd->dst.y_stride, &sse);
         // Note our transform coeffs are 8 times an orthogonal transform.
@@ -3482,19 +3580,21 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         unsigned int sse, var;
         int tmp_rate_y, tmp_rate_u, tmp_rate_v;
         int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-        if (is_comp_pred)
-          vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
+        // TODO(jkoleszar): these 2 y/uv should be replaced with one call to
+        // vp9_build_interintra_16x16_predictors_mb().
+        vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16,
+                                            mb_row, mb_col);
+
 #if CONFIG_COMP_INTERINTRA_PRED
         if (is_comp_interintra_pred) {
           vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
         }
 #endif
-        vp9_build_1st_inter16x16_predictors_mbuv(xd, xd->predictor + 256,
-                                                 xd->predictor + 320, 8);
-        if (is_comp_pred)
-          vp9_build_2nd_inter16x16_predictors_mbuv(xd, xd->predictor + 256,
-                                                   xd->predictor + 320, 8);
+
+        vp9_build_inter16x16_predictors_mbuv(xd, xd->predictor + 256,
+                                             xd->predictor + 320, 8,
+                                             mb_row, mb_col);
+
 #if CONFIG_COMP_INTERINTRA_PRED
         if (is_comp_interintra_pred) {
           vp9_build_interintra_16x16_predictors_mbuv(xd, xd->predictor + 256,
@@ -3589,28 +3689,29 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
-                                         xd->dst.uv_stride);
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
     } else if (block_size == BLOCK_32X32) {
       vp9_build_inter32x32_predictors_sb(xd,
                                          xd->dst.y_buffer,
                                          xd->dst.u_buffer,
                                          xd->dst.v_buffer,
                                          xd->dst.y_stride,
-                                         xd->dst.uv_stride);
+                                         xd->dst.uv_stride,
+                                         mb_row, mb_col);
     } else {
-      vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
+      // TODO(jkoleszar): These y/uv fns can be replaced with their mb
+      // equivalent
+      vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16,
+                                          mb_row, mb_col);
 #if CONFIG_COMP_INTERINTRA_PRED
       if (is_comp_interintra_pred) {
         vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
       }
 #endif
-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                 &xd->predictor[320], 8);
+      vp9_build_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                           &xd->predictor[320], 8,
+                                           mb_row, mb_col);
 #if CONFIG_COMP_INTERINTRA_PRED
       if (is_comp_interintra_pred) {
         vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
@@ -3805,6 +3906,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y1dc_delta_q);
 
+  struct scale_factors scale_factor[4];
+
   vpx_memset(mode8x8, 0, sizeof(mode8x8));
   vpx_memset(&frame_mv, 0, sizeof(frame_mv));
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
@@ -3828,24 +3931,24 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
   if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.active_ref_idx[cpi->lst_fb_idx],
+    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
                        LAST_FRAME, BLOCK_16X16, mb_row, mb_col,
                        frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb);
+                       frame_mdcounts, yv12_mb, scale_factor);
   }
 
   if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.active_ref_idx[cpi->gld_fb_idx],
+    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
                        GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col,
                        frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb);
+                       frame_mdcounts, yv12_mb, scale_factor);
   }
 
   if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->common.active_ref_idx[cpi->alt_fb_idx],
+    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
                        ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col,
                        frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb);
+                       frame_mdcounts, yv12_mb, scale_factor);
   }
 
   *returnintra = INT64_MAX;
@@ -3884,6 +3987,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
     int mode_excluded = 0;
     int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
+    YV12_BUFFER_CONFIG *scaled_ref_frame;
 
     // These variables hold are rolling total cost and distortion for this mode
     rate2 = 0;
@@ -3900,6 +4004,10 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
 
     mbmi->interp_filter = cm->mcomp_filter_type;
+
+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                      scale_factor);
+
     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
     // Test best rd so far against threshold for trying this mode.
@@ -3915,6 +4023,18 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
       continue;
 
+    // only scale on zeromv.
+    if (mbmi->ref_frame > 0 &&
+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode != ZEROMV)
+      continue;
+    if (mbmi->second_ref_frame > 0 &&
+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode != ZEROMV)
+      continue;
+
     // current coding mode under rate-distortion optimization test loop
 #if CONFIG_COMP_INTERINTRA_PRED
     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
@@ -3947,12 +4067,25 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     /* everything but intra */
+    scaled_ref_frame = NULL;
     if (mbmi->ref_frame) {
       int ref = mbmi->ref_frame;
+      int fb;
 
       xd->pre = yv12_mb[ref];
       best_ref_mv = mbmi->ref_mvs[ref][0];
       vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
+
+      if (mbmi->ref_frame == LAST_FRAME) {
+        fb = cpi->lst_fb_idx;
+      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
+        fb = cpi->gld_fb_idx;
+      } else {
+        fb = cpi->alt_fb_idx;
+      }
+
+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
     }
 
     if (mbmi->second_ref_frame > 0) {
@@ -4233,7 +4366,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         int uv_skippable;
 
         rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
-                       cpi->common.full_pixel);
+                       cpi->common.full_pixel, mb_row, mb_col);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -4275,7 +4408,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                   &rate_y, &distortion,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv);
+                                  mode_index, &tmp_best_filter, frame_mv,
+                                  scaled_ref_frame, mb_row, mb_col);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -4526,6 +4660,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->mb_skip_coeff =
       (cpi->common.mb_no_coeff_skip) ? 1 : 0;
     mbmi->partitioning = 0;
+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                      scale_factor);
 
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
@@ -4578,6 +4714,8 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
 end:
+  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                    scale_factor);
   store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
                        best_mode_index, &best_partition,
                        &mbmi->ref_mvs[mbmi->ref_frame][0],
@@ -4791,9 +4929,9 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
-                     cpi->common.active_ref_idx[cpi->lst_fb_idx],
-                     cpi->common.active_ref_idx[cpi->gld_fb_idx],
-                     cpi->common.active_ref_idx[cpi->alt_fb_idx]};
+                     cpi->lst_fb_idx,
+                     cpi->gld_fb_idx,
+                     cpi->alt_fb_idx};
   int mdcounts[4];
   int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
   int saddone = 0;
@@ -4820,6 +4958,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
   int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
   MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+  struct scale_factors scale_factor[4];
 
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
@@ -4835,7 +4974,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
                          mb_row, mb_col, frame_mv[NEARESTMV],
                          frame_mv[NEARMV], frame_mdcounts,
-                         yv12_mb);
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -4914,6 +5053,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     }
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                      scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
@@ -4921,6 +5062,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
     mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
+
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
     mbmi->interp_filter = cm->mcomp_filter_type;
@@ -4945,6 +5087,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       if (!(cpi->ref_frame_flags & flag_list[second_ref]))
         continue;
       mbmi->second_ref_frame = second_ref;
+      set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                        scale_factor);
 
       xd->second_pre = yv12_mb[second_ref];
       mode_excluded =
@@ -5022,6 +5166,20 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
       rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
       distortion2 = distortion_y + distortion_uv;
     } else {
+      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+      int fb;
+
+      if (mbmi->ref_frame == LAST_FRAME) {
+        fb = cpi->lst_fb_idx;
+      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
+        fb = cpi->gld_fb_idx;
+      } else {
+        fb = cpi->alt_fb_idx;
+      }
+
+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
 #if CONFIG_COMP_INTERINTRA_PRED
       if (mbmi->second_ref_frame == INTRA_FRAME) {
         if (best_intra16_mode == DC_PRED - 1) continue;
@@ -5043,7 +5201,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv);
+                                  mode_index, &tmp_best_filter, frame_mv,
+                                  scaled_ref_frame, mb_row, mb_col);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -5296,6 +5455,8 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   }
 
  end:
+  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+                    scale_factor);
   {
     PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?
                             &x->sb32_context[xd->sb_index] :
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 710ae58fe..01b156044 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -45,18 +45,4 @@ extern void vp9_init_me_luts();
 extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
                                    MB_PREDICTION_MODE mb, int_mv *mv);
 
-static void setup_pred_block(YV12_BUFFER_CONFIG *dst,
-                             const YV12_BUFFER_CONFIG *src,
-                             int mb_row, int mb_col) {
-  const int recon_y_stride = src->y_stride;
-  const int recon_uv_stride = src->uv_stride;
-  const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
-  const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
-
-  *dst = *src;
-  dst->y_buffer += recon_yoffset;
-  dst->u_buffer += recon_uvoffset;
-  dst->v_buffer += recon_uvoffset;
-}
-
 #endif  // VP9_ENCODER_VP9_RDOPT_H_
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index d016e52cc..a6cd1c0c3 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -12,6 +12,7 @@
 #include <limits.h>
 
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_quantize.h"
@@ -42,40 +43,35 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             int mv_row,
                                             int mv_col,
                                             uint8_t *pred) {
-  int offset;
-  uint8_t *yptr, *uptr, *vptr;
-  int omv_row, omv_col;
-
-  // Y
-  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
-
-  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][0](
-      yptr, stride, &pred[0], 16,
-      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
-      16, 16);
-
-  // U & V
-  omv_row = mv_row;
-  omv_col = mv_col;
-  mv_row >>= 1;
-  mv_col >>= 1;
+  const int which_mv = 0;
+  int_mv subpel_mv;
+  int_mv fullpel_mv;
+
+  subpel_mv.as_mv.row = mv_row;
+  subpel_mv.as_mv.col = mv_col;
+  // TODO(jkoleszar): Make this rounding consistent with the rest of the code
+  fullpel_mv.as_mv.row = (mv_row >> 1) & ~7;
+  fullpel_mv.as_mv.col = (mv_col >> 1) & ~7;
+
+  vp9_build_inter_predictor(y_mb_ptr, stride,
+                            &pred[0], 16,
+                            &subpel_mv,
+                            &xd->scale_factor[which_mv],
+                            16, 16, which_mv, &xd->subpix);
+
   stride = (stride + 1) >> 1;
-  offset = (mv_row >> 3) * stride + (mv_col >> 3);
-  uptr = u_mb_ptr + offset;
-  vptr = v_mb_ptr + offset;
-
-  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](
-      uptr, stride, &pred[256], 8,
-      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,
-      8, 8);
-
-  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](
-      vptr, stride, &pred[320], 8,
-      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,
-      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,
-      8, 8);
+
+  vp9_build_inter_predictor_q4(u_mb_ptr, stride,
+                               &pred[256], 8,
+                               &fullpel_mv, &subpel_mv,
+                               &xd->scale_factor_uv[which_mv],
+                               8, 8, which_mv, &xd->subpix);
+
+  vp9_build_inter_predictor_q4(v_mb_ptr, stride,
+                               &pred[320], 8,
+                               &fullpel_mv, &subpel_mv,
+                               &xd->scale_factor_uv[which_mv],
+                               8, 8, which_mv, &xd->subpix);
 }
 
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
@@ -460,6 +456,13 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
 , start_frame);
 #endif
 
+  // Set up scaling factors. Scaling on each of the ARNR frames is not supported
+  vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
+      &cpi->common.yv12_fb[cpi->common.new_fb_idx],
+      16 * cpi->common.mb_cols,
+      16 * cpi->common.mb_rows);
+  cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
+
   // Setup frame pointers, NULL indicates frame not included in filter
   vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
   for (frame = 0; frame < frames_to_blur; frame++) {
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 8efc97697..1b07359da 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -145,17 +145,14 @@ static void tokenize_b(VP9_COMP *cpi,
       probs = cpi->common.fc.coef_probs_4x4;
       break;
     case TX_8X8:
-#if CONFIG_CNVCONTEXT
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
-#endif
       seg_eob = 64;
       scan = vp9_default_zig_zag1d_8x8;
       counts = cpi->coef_counts_8x8;
       probs = cpi->common.fc.coef_probs_8x8;
       break;
     case TX_16X16:
-#if CONFIG_CNVCONTEXT
       if (type != PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
@@ -163,7 +160,6 @@ static void tokenize_b(VP9_COMP *cpi,
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
-#endif
       seg_eob = 256;
       scan = vp9_default_zig_zag1d_16x16;
       counts = cpi->coef_counts_16x16;
@@ -174,14 +170,12 @@ static void tokenize_b(VP9_COMP *cpi,
       }
       break;
     case TX_32X32:
-#if CONFIG_CNVCONTEXT
       a_ec = a[0] + a[1] + a[2] + a[3] +
              a1[0] + a1[1] + a1[2] + a1[3];
       l_ec = l[0] + l[1] + l[2] + l[3] +
              l1[0] + l1[1] + l1[2] + l1[3];
       a_ec = a_ec != 0;
       l_ec = l_ec != 0;
-#endif
       seg_eob = 1024;
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
@@ -635,15 +629,12 @@ static INLINE void stuff_b(VP9_COMP *cpi,
       probs = cpi->common.fc.coef_probs_4x4;
       break;
     case TX_8X8:
-#if CONFIG_CNVCONTEXT
       a_ec = (a[0] + a[1]) != 0;
       l_ec = (l[0] + l[1]) != 0;
-#endif
       counts = cpi->coef_counts_8x8;
       probs = cpi->common.fc.coef_probs_8x8;
       break;
     case TX_16X16:
-#if CONFIG_CNVCONTEXT
       if (type != PLANE_TYPE_UV) {
         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
@@ -651,19 +642,16 @@ static INLINE void stuff_b(VP9_COMP *cpi,
         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
       }
-#endif
       counts = cpi->coef_counts_16x16;
       probs = cpi->common.fc.coef_probs_16x16;
       break;
     case TX_32X32:
-#if CONFIG_CNVCONTEXT
       a_ec = a[0] + a[1] + a[2] + a[3] +
              a1[0] + a1[1] + a1[2] + a1[3];
       l_ec = l[0] + l[1] + l[2] + l[3] +
              l1[0] + l1[1] + l1[2] + l1[3];
       a_ec = a_ec != 0;
       l_ec = l_ec != 0;
-#endif
       counts = cpi->coef_counts_32x32;
       probs = cpi->common.fc.coef_probs_32x32;
       break;