40 files changed, 475 insertions, 472 deletions
diff --git a/configure b/configure
index 5c8dc8e84..35801e200 100755
--- a/configure
+++ b/configure
@@ -246,6 +246,7 @@ EXPERIMENT_LIST="
     enable_6tap
     abovesprefmv
     code_nonzerocount
+    useselectrefmv
 "
 CONFIG_LIST="
     external_build
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index f7709ecd4..dfb64c3a2 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -120,7 +120,7 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4llm_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
 
     for (int j = 0; j < 16; ++j) {
       const int diff = test_input_block[j] - test_output_block[j];
diff --git a/test/idctllm_test.cc b/test/idct_test.cc
index d6fdffea5..51fb65a43 100644
--- a/test/idctllm_test.cc
+++ b/test/idct_test.cc
@@ -10,8 +10,8 @@
 
 
 extern "C" {
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
 }
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
@@ -20,18 +20,16 @@ typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,
                           int pred_stride, unsigned char *dst_ptr,
                           int dst_stride);
 namespace {
-class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
-{
+class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {
   protected:
-    virtual void SetUp()
-    {
+    virtual void SetUp() {
         int i;
 
         UUT = GetParam();
         memset(input, 0, sizeof(input));
         /* Set up guard blocks */
-        for(i=0; i<256; i++)
-            output[i] = ((i&0xF)<4&&(i<64))?0:-1;
+        for (i = 0; i < 256; i++)
+            output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;
     }
 
     idct_fn_t UUT;
@@ -40,78 +38,72 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t>
     unsigned char predict[256];
 };
 
-TEST_P(IDCTTest, TestGuardBlocks)
-{
+TEST_P(IDCTTest, TestGuardBlocks) {
     int i;
 
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
             EXPECT_EQ(0, output[i]) << i;
         else
             EXPECT_EQ(255, output[i]);
 }
 
-TEST_P(IDCTTest, TestAllZeros)
-{
+TEST_P(IDCTTest, TestAllZeros) {
     int i;
 
     REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
             EXPECT_EQ(0, output[i]) << "i==" << i;
         else
             EXPECT_EQ(255, output[i]) << "i==" << i;
 }
 
-TEST_P(IDCTTest, TestAllOnes)
-{
+TEST_P(IDCTTest, TestAllOnes) {
     int i;
 
     input[0] = 4;
     REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
             EXPECT_EQ(1, output[i]) << "i==" << i;
         else
             EXPECT_EQ(255, output[i]) << "i==" << i;
 }
 
-TEST_P(IDCTTest, TestAddOne)
-{
+TEST_P(IDCTTest, TestAddOne) {
     int i;
 
-    for(i=0; i<256; i++)
+    for (i = 0; i < 256; i++)
         predict[i] = i;
-
     input[0] = 4;
     REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));
 
-    for(i=0; i<256; i++)
-        if((i&0xF) < 4 && i<64)
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) < 4 && i < 64)
             EXPECT_EQ(i+1, output[i]) << "i==" << i;
         else
             EXPECT_EQ(255, output[i]) << "i==" << i;
 }
 
-TEST_P(IDCTTest, TestWithData)
-{
+TEST_P(IDCTTest, TestWithData) {
     int i;
 
-    for(i=0; i<16; i++)
+    for (i = 0; i < 16; i++)
         input[i] = i;
 
     REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));
 
-    for(i=0; i<256; i++)
-        if((i&0xF) > 3 || i>63)
+    for (i = 0; i < 256; i++)
+        if ((i & 0xF) > 3 || i > 63)
             EXPECT_EQ(255, output[i]) << "i==" << i;
-        else if(i == 0)
+        else if (i == 0)
             EXPECT_EQ(11, output[i]) << "i==" << i;
-        else if(i == 34)
+        else if (i == 34)
             EXPECT_EQ(1, output[i]) << "i==" << i;
-        else if(i == 2 || i == 17 || i == 32)
+        else if (i == 2 || i == 17 || i == 32)
             EXPECT_EQ(3, output[i]) << "i==" << i;
         else
             EXPECT_EQ(0, output[i]) << "i==" << i;
diff --git a/test/test.mk b/test/test.mk
index 37e4ee793..793fbf8b2 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -47,7 +47,7 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 endif
 
-LIBVPX_TEST_SRCS-yes                   += idctllm_test.cc
+LIBVPX_TEST_SRCS-yes                   += idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
diff --git a/vp9/common/ppc/vp9_idctllm_altivec.asm b/vp9/common/ppc/vp9_idct_altivec.asm
index 117d9cfc8..b87aa4200 100644
--- a/vp9/common/ppc/vp9_idctllm_altivec.asm
+++ b/vp9/common/ppc/vp9_idct_altivec.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    .globl short_idct4x4llm_ppc
+    .globl short_idct4x4_ppc
 
 .macro load_c V, LABEL, OFF, R0, R1
     lis     \R0, \LABEL@ha
@@ -21,7 +21,7 @@
 ;# r4 short *output
 ;# r5 int pitch
     .align 2
-short_idct4x4llm_ppc:
+short_idct4x4_ppc:
     mfspr   r11, 256            ;# get old VRSAVE
     oris    r12, r11, 0xfff8
     mtspr   256, r12            ;# set VRSAVE
diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c
index 02035191f..ac13722d4 100644
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ b/vp9/common/ppc/vp9_systemdependent.c
@@ -63,7 +63,7 @@ void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_pt
 void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+extern void short_idct4x4_ppc(short *input, short *output, int pitch);
 
 // Generic C
 extern subpixel_predict_function vp9_sixtap_predict_c;
@@ -83,8 +83,8 @@ void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_
 void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
 
-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);
+extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
 extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
 
 // PPC
@@ -139,8 +139,8 @@ void vp9_machine_specific_config(void) {
   vp9_sixtap_predict8x4                = sixtap_predict8x4_ppc;
   vp9_sixtap_predict                   = sixtap_predict_ppc;
 
-  vp8_short_idct4x4_1                  = vp9_short_idct4x4llm_1_c;
-  vp8_short_idct4x4                    = short_idct4x4llm_ppc;
+  vp8_short_idct4x4_1                  = vp9_short_idct4x4_1_c;
+  vp8_short_idct4x4                    = short_idct4x4_ppc;
   vp8_dc_only_idct                      = vp8_dc_only_idct_c;
 
   vp8_lf_mbvfull                       = loop_filter_mbv_ppc;
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index c3d6dae93..15c8c0d64 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -67,16 +67,13 @@ void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
 
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
   int i;
+  int aligned_width, aligned_height;
 
   vp9_de_alloc_frame_buffers(oci);
 
   /* our internal buffers are always multiples of 16 */
-  if ((width & 0xf) != 0)
-    width += 16 - (width & 0xf);
-
-  if ((height & 0xf) != 0)
-    height += 16 - (height & 0xf);
-
+  aligned_width = (width + 15) & ~15;
+  aligned_height = (height + 15) & ~15;
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++) {
     oci->fb_idx_ref_cnt[i] = 0;
@@ -110,8 +107,8 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
     return 1;
   }
 
-  oci->mb_rows = height >> 4;
-  oci->mb_cols = width >> 4;
+  oci->mb_rows = aligned_height >> 4;
+  oci->mb_cols = aligned_width >> 4;
   oci->MBs = oci->mb_rows * oci->mb_cols;
   oci->mode_info_stride = oci->mb_cols + 1;
   oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 8409885a0..f6d6932cc 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -118,10 +118,12 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
   return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
 }
 
+#if CONFIG_USESELECTREFMV
 /* check a list of motion vectors by sad score using a number rows of pixels
  * above and a number cols of pixels in the left to select the one with best
  * score to use as ref motion vector
  */
+
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
                            uint8_t *ref_y_buffer,
                            int ref_y_stride,
@@ -298,3 +300,20 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
   // Copy back the re-ordered mv list
   vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
 }
+#else
+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
+                           uint8_t *ref_y_buffer,
+                           int ref_y_stride,
+                           int_mv *mvlist,
+                           int_mv *nearest,
+                           int_mv *near) {
+  int i;
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+    lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
+    clamp_mv2(&mvlist[i], xd);
+  }
+  *nearest = mvlist[0];
+  *near = mvlist[1];
+}
+#endif
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idct.c
index e2106250f..3ec093f73 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idct.c
@@ -8,20 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-/****************************************************************************
- * Notes:
- *
- * This implementation makes use of 16 bit fixed point verio of two multiply
- * constants:
- *         1.   sqrt(2) * cos (pi/8)
- *         2.   sqrt(2) * sin (pi/8)
- * Becuase the first constant is bigger than 1, to maintain the same 16 bit
- * fixed point precision as the second one, we use a trick of
- *         x * a = x + x*(a-1)
- * so
- *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
- **************************************************************************/
 #include <assert.h>
 #include <math.h>
 
@@ -32,7 +18,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
   int16_t *ip = input;
@@ -73,7 +59,7 @@ void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
@@ -99,7 +85,7 @@ void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
   int r, c;
   int16_t dc = input_dc;
   int16_t tmp[4 * 4];
-  vp9_short_inv_walsh4x4_1_x8_c(&dc, tmp, 4 << 1);
+  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 4; c++)
@@ -130,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
   const int half_pitch = pitch >> 1;
@@ -156,7 +142,7 @@ void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1;
   int16_t *op = output;
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 79d060945..55bcccb0e 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -211,8 +211,10 @@ extern "C"
 
   int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
 
-  int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                            YV12_BUFFER_CONFIG *sd);
+  int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
+                             YV12_BUFFER_CONFIG *sd);
+
+  int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
 
   int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
                             YV12_BUFFER_CONFIG *sd);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index a0867ae7f..f98ec442d 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,8 +20,8 @@
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
                                        YV12_BUFFER_CONFIG *other,
                                        int this_w, int this_h) {
-  int other_h = other->y_height;
-  int other_w = other->y_width;
+  int other_h = other->y_crop_height;
+  int other_w = other->y_crop_width;
 
   scale->x_num = other_w;
   scale->x_den = this_w;
@@ -95,7 +95,7 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
 
     vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
                                       &cm->yv12_fb[cm->active_ref_idx[i]],
-                                      cm->mb_cols * 16, cm->mb_rows * 16);
+                                      cm->Width, cm->Height);
   }
 
   if (xd->mode_info_context) {
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index 3031fb699..b97b6089d 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -17,9 +17,10 @@
 void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
 
 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n);
+                                              int stride, int n,
+                                              int tx, int ty);
 
-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);
 
 #if CONFIG_COMP_INTERINTRA_PRED
 void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index 7fbee7c32..eab5ab495 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -15,17 +15,17 @@
 #include "vp9_rtcd.h"
 
 #if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(uint8_t *x, int stride, int n, int t,
+static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,
                              int dx, int dy) {
   int i, j;
   int count = 0, gsum = 0, gdiv;
   /* TODO: Make this code more efficient by breaking up into two loops */
-  for (i = -t; i < n; ++i)
-    for (j = -t; j < n; ++j) {
+  for (i = -ty; i < n; ++i)
+    for (j = -tx; j < n; ++j) {
       int g;
       if (i >= 0 && j >= 0) continue;
       if (i + dy >= 0 && j + dx >= 0) continue;
-      if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue;
+      if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;
       g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);
       gsum += g * g;
       count++;
@@ -36,14 +36,15 @@ static int find_grad_measure(uint8_t *x, int stride, int n, int t,
 
 #if CONTEXT_PRED_REPLACEMENTS == 6
 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n) {
+                                              int stride, int n,
+                                              int tx, int ty) {
   int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, 4,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
+  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
+  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
+  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
+  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
   imin = 1;
   for (i = 2; i < 8; i += 1 + (i == 3))
     imin = (g[i] < g[imin] ? i : imin);
@@ -73,12 +74,13 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
 }
 #elif CONTEXT_PRED_REPLACEMENTS == 4
 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n) {
+                                              int stride, int n,
+                                              int tx, int ty) {
   int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);
-  g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
-  g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
+  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
+  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
   imin = 1;
   for (i = 3; i < 8; i+=2)
     imin = (g[i] < g[imin] ? i : imin);
@@ -104,16 +106,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
 }
 #elif CONTEXT_PRED_REPLACEMENTS == 0
 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n) {
+                                              int stride, int n,
+                                              int tx, int ty) {
   int g[8], i, imin, imax;
-  g[0] = find_grad_measure(ptr, stride, n, 4,  1, 0);
-  g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, 4,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);
-  g[4] = find_grad_measure(ptr, stride, n, 4,  0, 1);
-  g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);
+  g[0] = find_grad_measure(ptr, stride, n, tx, ty,  1, 0);
+  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
+  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
+  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
+  g[4] = find_grad_measure(ptr, stride, n, tx, ty,  0, 1);
+  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
+  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
+  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
   imax = 0;
   for (i = 1; i < 8; i++)
     imax = (g[i] > g[imax] ? i : imax);
@@ -144,10 +147,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
 }
 #endif
 
-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) {
+  const int block_idx = x - xd->block;
+  const int have_top = (block_idx >> 2) || xd->up_available;
+  const int have_left = (block_idx & 3)  || xd->left_available;
   uint8_t *ptr = *(x->base_dst) + x->dst;
   int stride = x->dst_stride;
-  return vp9_find_dominant_direction(ptr, stride, 4);
+  int tx = have_left ? 4 : 0;
+  int ty = have_top ? 4 : 0;
+  if (!have_left && !have_top)
+    return B_DC_PRED;
+  return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);
 }
 #endif
 
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 911fcc55e..0c2a5c94a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -279,11 +279,11 @@ specialize vp9_convolve8_avg_vert ssse3
 #
 # dct
 #
-prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm_1
+prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct4x4_1
 
-prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4llm sse2
+prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct4x4 sse2
 
 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct8x8
@@ -330,10 +330,10 @@ specialize vp9_idct4_1d sse2
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_inv_walsh4x4_1_x8
-prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_inv_walsh4x4_x8
+prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_iwalsh4x4_1
+prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_iwalsh4x4
 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_inv_walsh_add
 
@@ -600,11 +600,11 @@ specialize vp9_short_fdct32x32
 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct16x16 sse2
 
-prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4_x8
+prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_walsh4x4
 
-prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh8x4_x8
+prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_walsh8x4
 
 #
 # Motion search
diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
index 8f3c6dfc3..8f3c6dfc3 100644
--- a/vp9/common/x86/vp9_idctllm_sse2.asm
+++ b/vp9/common/x86/vp9_idct_sse2.asm
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idct_x86.c
index 3d7a1481c..6a35823bd 100644
--- a/vp9/common/x86/vp9_idctllm_x86.c
+++ b/vp9/common/x86/vp9_idct_x86.c
@@ -74,7 +74,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index bfc0a9dde..b53e419b5 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -138,14 +138,14 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) {
 
   if (mb->lossless) {
     assert(qindex == 0);
-    mb->inv_txm4x4_1      = vp9_short_inv_walsh4x4_1_x8;
-    mb->inv_txm4x4        = vp9_short_inv_walsh4x4_x8;
+    mb->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;
+    mb->inv_txm4x4        = vp9_short_iwalsh4x4;
     mb->itxm_add          = vp9_dequant_idct_add_lossless_c;
     mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block_lossless_c;
     mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
   } else {
-    mb->inv_txm4x4_1      = vp9_short_idct4x4llm_1;
-    mb->inv_txm4x4        = vp9_short_idct4x4llm;
+    mb->inv_txm4x4_1      = vp9_short_idct4x4_1;
+    mb->inv_txm4x4        = vp9_short_idct4x4;
     mb->itxm_add          = vp9_dequant_idct_add;
     mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block;
     mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
@@ -377,7 +377,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
       int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
 #if CONFIG_NEWBINTRAMODES
       xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =
-          vp9_find_bpred_context(b);
+          vp9_find_bpred_context(xd, b);
 #endif
       if (!xd->mode_info_context->mbmi.mb_skip_coeff)
         eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
@@ -1364,7 +1364,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
 
   /* Reset the frame pointers to the current frame size */
   vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
-                                pc->mb_cols * 16, pc->mb_rows * 16,
+                                pc->Width, pc->Height,
                                 VP9BORDERINPIXELS);
 
   if (vp9_start_decode(&header_bc, data,
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index 92b78ed19..cb4601a15 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -126,7 +126,7 @@ void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
       input[i] *= dq[i];
 
     // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4llm(input, output, 4 << 1);
+    vp9_short_idct4x4(input, output, 4 << 1);
 
     vpx_memset(input, 0, 32);
 
@@ -148,7 +148,7 @@ void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
     input[i] *= dq[i];
 
   // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4llm(input, output, 4 << 1);
+  vp9_short_idct4x4(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
   vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
@@ -163,7 +163,7 @@ void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
     for (i = 0; i < 16; i++)
       input[i] *= dq[i];
 
-    vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
 
     vpx_memset(input, 0, 32);
 
@@ -186,7 +186,7 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
   for (i = 1; i < 16; i++)
     input[i] *= dq[i];
 
-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
+  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
   vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
diff --git a/vp9/decoder/vp9_onyxd.h b/vp9/decoder/vp9_onyxd.h
index 748fc7ea3..cd71166e4 100644
--- a/vp9/decoder/vp9_onyxd.h
+++ b/vp9/decoder/vp9_onyxd.h
@@ -46,14 +46,16 @@ extern "C" {
                         int64_t *time_stamp, int64_t *time_end_stamp,
                         vp9_ppflags_t *flags);
 
-  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
+  vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,
+                                         VP9_REFFRAME ref_frame_flag,
+                                         YV12_BUFFER_CONFIG *sd);
 
   vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
                                         VP9_REFFRAME ref_frame_flag,
                                         YV12_BUFFER_CONFIG *sd);
 
+  int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+
   VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
 
   void vp9_remove_decompressor(VP9D_PTR comp);
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 63895800d..5cb2a095b 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -164,8 +164,9 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
 }
 
 
-vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                                      YV12_BUFFER_CONFIG *sd) {
+vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd) {
   VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
   int ref_fb_idx;
@@ -242,6 +243,17 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
 }
 
 
+int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9_COMMON *cm = &pbi->common;
+
+  if (index < 0 || index >= NUM_REF_FRAMES)
+    return -1;
+
+  *fb = &cm->yv12_fb[cm->ref_frame_map[index]];
+  return 0;
+}
+
 /* If any buffer updating is signalled it should be done here. */
 static void swap_frame_buffers(VP9D_COMP *pbi) {
   int ref_index = 0, mask;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index bb6e44fea..6365ed9a2 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -553,7 +553,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
   }
 }
 
-void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
+void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
   short *ip = input;
@@ -593,9 +593,9 @@ void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {
   }
 }
 
-void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
-  vp9_short_walsh4x4_x8_c(input,   output,    pitch);
-  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
+void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {
+  vp9_short_walsh4x4_c(input,   output,    pitch);
+  vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
 }
 
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b7b270031..428e585e1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -594,9 +594,6 @@ static void update_state(VP9_COMP *cpi,
           [vp9_switchable_interp_map[mbmi->interp_filter]];
     }
 
-    cpi->prediction_error += ctx->distortion;
-    cpi->intra_error += ctx->intra_error;
-
     cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
     cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY]   += ctx->comp_pred_diff;
     cpi->rd_comp_pred_diff[HYBRID_PREDICTION]      += ctx->hybrid_pred_diff;
@@ -1217,10 +1214,10 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
-    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4_x8;
-    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4_x8;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
+    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
     cpi->mb.optimize              = 0;
     cpi->common.filter_level      = 0;
     cpi->zbin_mode_boost_enabled  = FALSE;
@@ -1228,8 +1225,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   } else {
     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
   }
 }
 
@@ -1265,8 +1262,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   // Reset frame count of inter 0,0 motion vector usage.
   cpi->inter_zz_count = 0;
 
-  cpi->prediction_error = 0;
-  cpi->intra_error = 0;
   cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
   cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
 
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 3c98d4aa6..9e5bcea16 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -44,7 +44,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   TX_TYPE tx_type;
 
 #if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(b);
+  b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
 #endif
 
   vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index f30403cda..0b907b361 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -859,6 +859,8 @@ static double calc_correction_factor(double err_per_mb,
   power_term = (power_term > pt_high) ? pt_high : power_term;
 
   // Calculate correction factor
+  if (power_term < 1.0)
+    assert(error_term >= 0.0);
   correction_factor = pow(error_term, power_term);
 
   // Clip range
@@ -920,15 +922,19 @@ static int estimate_max_q(VP9_COMP *cpi,
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
-  sr_err_diff =
-    (fpstats->sr_coded_error - fpstats->coded_error) /
-    (fpstats->count * cpi->common.MBs);
-  sr_correction = (sr_err_diff / 32.0);
-  sr_correction = pow(sr_correction, 0.25);
-  if (sr_correction < 0.75)
+  if (fpstats->sr_coded_error > fpstats->coded_error) {
+    sr_err_diff =
+      (fpstats->sr_coded_error - fpstats->coded_error) /
+      (fpstats->count * cpi->common.MBs);
+    sr_correction = (sr_err_diff / 32.0);
+    sr_correction = pow(sr_correction, 0.25);
+    if (sr_correction < 0.75)
+      sr_correction = 0.75;
+    else if (sr_correction > 1.25)
+      sr_correction = 1.25;
+  } else {
     sr_correction = 0.75;
-  else if (sr_correction > 1.25)
-    sr_correction = 1.25;
+  }
 
   // Calculate a corrective factor based on a rolling ratio of bits spent
   // vs target bits
@@ -1031,15 +1037,19 @@ static int estimate_cq(VP9_COMP *cpi,
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
-  sr_err_diff =
-    (fpstats->sr_coded_error - fpstats->coded_error) /
-    (fpstats->count * cpi->common.MBs);
-  sr_correction = (sr_err_diff / 32.0);
-  sr_correction = pow(sr_correction, 0.25);
-  if (sr_correction < 0.75)
+  if (fpstats->sr_coded_error > fpstats->coded_error) {
+    sr_err_diff =
+      (fpstats->sr_coded_error - fpstats->coded_error) /
+      (fpstats->count * cpi->common.MBs);
+    sr_correction = (sr_err_diff / 32.0);
+    sr_correction = pow(sr_correction, 0.25);
+    if (sr_correction < 0.75)
+      sr_correction = 0.75;
+    else if (sr_correction > 1.25)
+      sr_correction = 1.25;
+  } else {
     sr_correction = 0.75;
-  else if (sr_correction > 1.25)
-    sr_correction = 1.25;
+  }
 
   // II ratio correction factor for clip as a whole
   clip_iiratio = cpi->twopass.total_stats->intra_error /
@@ -1178,12 +1188,16 @@ static double get_prediction_decay_rate(VP9_COMP *cpi,
   mb_sr_err_diff =
     (next_frame->sr_coded_error - next_frame->coded_error) /
     (cpi->common.MBs);
-  second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
-  second_ref_decay = pow(second_ref_decay, 0.5);
-  if (second_ref_decay < 0.85)
+  if (mb_sr_err_diff <= 512.0) {
+    second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
+    second_ref_decay = pow(second_ref_decay, 0.5);
+    if (second_ref_decay < 0.85)
+      second_ref_decay = 0.85;
+    else if (second_ref_decay > 1.0)
+      second_ref_decay = 1.0;
+  } else {
     second_ref_decay = 0.85;
-  else if (second_ref_decay > 1.0)
-    second_ref_decay = 1.0;
+  }
 
   if (second_ref_decay < prediction_decay_rate)
     prediction_decay_rate = second_ref_decay;
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 2214ac99b..a89d2547e 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -62,10 +62,6 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
   // Clamp the lookahead queue depth
   depth = clamp(depth, 1, MAX_LAG_BUFFERS);
 
-  // Align the buffer dimensions
-  width = (width + 15) &~15;
-  height = (height + 15) &~15;
-
   // Allocate the lookahead structures
   ctx = calloc(1, sizeof(*ctx));
   if (ctx) {
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 4901e6a90..cd8e74624 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -843,8 +843,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   cpi->mb.fwd_txm8x4    = vp9_short_fdct8x4;
   cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;
   if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4_x8;
-    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4_x8;
+    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4;
+    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4;
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
@@ -873,9 +873,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
-  int width = (cpi->oxcf.Width + 15) & ~15;
-  int height = (cpi->oxcf.Height + 15) & ~15;
-
   cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
                                       cpi->oxcf.lag_in_frames);
   if (!cpi->lookahead)
@@ -885,7 +882,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
 #if VP9_TEMPORAL_ALT_REF
 
   if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  width, height, VP9BORDERINPIXELS))
+                                  cpi->oxcf.Width, cpi->oxcf.Height,
+                                  VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 
@@ -909,10 +907,7 @@ static int alloc_partition_data(VP9_COMP *cpi) {
 void vp9_alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
-  int width = cm->Width;
-  int height = cm->Height;
-
-  if (vp9_alloc_frame_buffers(cm, width, height))
+  if (vp9_alloc_frame_buffers(cm, cm->Width, cm->Height))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffers");
 
@@ -920,21 +915,13 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate partition data");
 
-
-  if ((width & 0xf) != 0)
-    width += 16 - (width & 0xf);
-
-  if ((height & 0xf) != 0)
-    height += 16 - (height & 0xf);
-
-
   if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  width, height, VP9BORDERINPIXELS))
+                                  cm->Width, cm->Height, VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
   if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  width, height, VP9BORDERINPIXELS))
+                                  cm->Width, cm->Height, VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -996,11 +983,11 @@ static void update_frame_size(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
   /* our internal buffers are always multiples of 16 */
-  int width = (cm->Width + 15) & ~15;
-  int height = (cm->Height + 15) & ~15;
+  int aligned_width = (cm->Width + 15) & ~15;
+  int aligned_height = (cm->Height + 15) & ~15;
 
-  cm->mb_rows = height >> 4;
-  cm->mb_cols = width >> 4;
+  cm->mb_rows = aligned_height >> 4;
+  cm->mb_cols = aligned_width >> 4;
   cm->MBs = cm->mb_rows * cm->mb_cols;
   cm->mode_info_stride = cm->mb_cols + 1;
   memset(cm->mip, 0,
@@ -1013,12 +1000,12 @@ static void update_frame_size(VP9_COMP *cpi) {
 
   /* Update size of buffers local to this frame */
   if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,
-                                    width, height, VP9BORDERINPIXELS))
+                                    cm->Width, cm->Height, VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate last frame buffer");
 
   if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,
-                                    width, height, VP9BORDERINPIXELS))
+                                    cm->Width, cm->Height, VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
@@ -1217,11 +1204,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_inv_walsh4x4_1_x8;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4llm_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4llm;
+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1315,9 +1302,6 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cm->Width       = cpi->oxcf.Width;
   cm->Height      = cpi->oxcf.Height;
 
-  cm->horiz_scale  = cpi->horiz_scale;
-  cm->vert_scale   = cpi->vert_scale;
-
   // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
   if (cpi->oxcf.Sharpness > 7)
     cpi->oxcf.Sharpness = 7;
@@ -2103,8 +2087,8 @@ int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
   return 0;
 }
 
-int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
-                          YV12_BUFFER_CONFIG *sd) {
+int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
   VP9_COMMON *cm = &cpi->common;
   int ref_fb_idx;
@@ -2123,6 +2107,17 @@ int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   return 0;
 }
 
+int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
+  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMMON *cm = &cpi->common;
+
+  if (index < 0 || index >= NUM_REF_FRAMES)
+    return -1;
+
+  *fb = &cm->yv12_fb[cm->ref_frame_map[index]];
+  return 0;
+}
+
 int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
                           YV12_BUFFER_CONFIG *sd) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
@@ -2212,10 +2207,10 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
 
 static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
                                    YV12_BUFFER_CONFIG *dst_fb) {
-  const int in_w = src_fb->y_width;
-  const int in_h = src_fb->y_height;
-  const int out_w = dst_fb->y_width;
-  const int out_h = dst_fb->y_height;
+  const int in_w = src_fb->y_crop_width;
+  const int in_h = src_fb->y_crop_height;
+  const int out_w = dst_fb->y_crop_width;
+  const int out_h = dst_fb->y_crop_height;
   int x, y;
 
   for (y = 0; y < out_h; y += 16) {
@@ -2617,12 +2612,12 @@ static void scale_references(VP9_COMP *cpi) {
   for (i = 0; i < 3; i++) {
     YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]];
 
-    if (ref->y_width != cm->mb_cols * 16 || ref->y_height != cm->mb_rows * 16) {
+    if (ref->y_crop_width != cm->Width ||
+        ref->y_crop_height != cm->Height) {
       int new_fb = get_free_fb(cm);
 
       vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],
-                                    cm->mb_cols * 16,
-                                    cm->mb_rows * 16,
+                                    cm->Width, cm->Height,
                                     VP9BORDERINPIXELS);
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[i] = new_fb;
@@ -3897,7 +3892,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
 
   /* Reset the frame pointers to the current frame size */
   vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
-                                cm->mb_cols * 16, cm->mb_rows * 16,
+                                cm->Width, cm->Height,
                                 VP9BORDERINPIXELS);
 
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 5de6a7ad2..7a1a9b249 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -390,11 +390,6 @@ typedef struct VP9_COMP {
   CODING_CONTEXT coding_context;
 
   // Rate targetting variables
-  int64_t prediction_error;
-  int64_t last_prediction_error;
-  int64_t intra_error;
-  int64_t last_intra_error;
-
   int this_frame_target;
   int projected_frame_size;
   int last_q[2];                   // Separate values for Intra/Inter
@@ -546,8 +541,6 @@ typedef struct VP9_COMP {
   int goldfreq;
   int auto_worst_q;
   int cpu_used;
-  int horiz_scale;
-  int vert_scale;
   int pass;
 
   vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 6f9333521..d80ea02c1 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -247,7 +247,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
   int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
 
   //  Make a copy of the unfiltered / processed recon buffer
-  vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);
+  vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
 
   if (cm->frame_type == KEY_FRAME)
     cm->sharpness_level = 0;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 2f03a264c..a8ea3956e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1165,7 +1165,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
 #if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(b);
+  b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
 #endif
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
@@ -1276,7 +1276,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
       bmode_costs  = mb->bmode_costs[A][L];
     }
 #if CONFIG_NEWBINTRAMODES
-    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd->block + i);
+    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);
 #endif
 
     total_rd += rd_pick_intra4x4block(
@@ -3441,9 +3441,11 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
 
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
-  mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,
-          frame_type, block_size);
-
+  // The current implementation doesn't support scaling.
+  if (scale[frame_type].x_num == scale[frame_type].x_den &&
+      scale[frame_type].y_num == scale[frame_type].y_den)
+    mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,
+            frame_type, block_size);
 }
 
 static void model_rd_from_var_lapndz(int var, int n, int qstep,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index fb61ece09..5df4d2562 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -457,8 +457,8 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
   vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
       &cpi->common.yv12_fb[cpi->common.new_fb_idx],
-      16 * cpi->common.mb_cols,
-      16 * cpi->common.mb_rows);
+      cpi->common.Width,
+      cpi->common.Height);
   cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
 
   // Setup frame pointers, NULL indicates frame not included in filter
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index f330b464a..ea8631711 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -9,6 +9,7 @@
 ##
 
 VP9_COMMON_SRCS-yes += vp9_common.mk
+VP9_COMMON_SRCS-yes += vp9_iface_common.h
 VP9_COMMON_SRCS-yes += common/vp9_pragmas.h
 VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
 VP9_COMMON_SRCS-yes += common/vp9_onyx.h
@@ -28,7 +29,7 @@ VP9_COMMON_SRCS-yes += common/vp9_filter.c
 VP9_COMMON_SRCS-yes += common/vp9_filter.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c
 VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c
-VP9_COMMON_SRCS-yes += common/vp9_idctllm.c
+VP9_COMMON_SRCS-yes += common/vp9_idct.c
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h
 VP9_COMMON_SRCS-yes += common/vp9_blockd.h
 VP9_COMMON_SRCS-yes += common/vp9_common.h
@@ -91,7 +92,7 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
@@ -110,13 +111,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 endif
 
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2
+vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 708cec602..1eeec6b5a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -16,6 +16,7 @@
 #include "vpx/vp8cx.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/common/vp9_onyx.h"
+#include "vp9/vp9_iface_common.h"
 #include <stdlib.h>
 #include <string.h>
 
@@ -544,6 +545,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
   yv12->u_buffer = img->planes[VPX_PLANE_U];
   yv12->v_buffer = img->planes[VPX_PLANE_V];
 
+  yv12->y_crop_width  = img->d_w;
+  yv12->y_crop_height = img->d_h;
   yv12->y_width  = img->d_w;
   yv12->y_height = img->d_h;
   yv12->uv_width = (1 + yv12->y_width) / 2;
@@ -867,9 +870,9 @@ static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
 
 }
 
-static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
-                                          int ctr_id,
-                                          va_list args) {
+static vpx_codec_err_t vp8e_copy_reference(vpx_codec_alg_priv_t *ctx,
+                                           int ctr_id,
+                                           va_list args) {
 
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
@@ -878,12 +881,28 @@ static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
     YV12_BUFFER_CONFIG sd;
 
     image2yuvconfig(&frame->img, &sd);
-    vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd);
     return VPX_CODEC_OK;
   } else
     return VPX_CODEC_INVALID_PARAM;
 }
 
+static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
+                                     int ctr_id,
+                                     va_list args) {
+  vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
+
+  if (data) {
+    YV12_BUFFER_CONFIG* fb;
+
+    vp9_get_reference_enc(ctx->cpi, data->idx, &fb);
+    yuvconfig2image(&data->img, fb, NULL);
+    return VPX_CODEC_OK;
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
 static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
@@ -1038,7 +1057,7 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
 
 static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   {VP8_SET_REFERENCE,                 vp8e_set_reference},
-  {VP8_COPY_REFERENCE,                vp8e_get_reference},
+  {VP8_COPY_REFERENCE,                vp8e_copy_reference},
   {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
   {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
   {VP8E_UPD_REFERENCE,                vp8e_update_reference},
@@ -1062,6 +1081,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
   {VP8E_SET_CQ_LEVEL,                 set_param},
   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},
   {VP9E_SET_LOSSLESS,                 set_param},
+  {VP9_GET_REFERENCE,                 get_reference},
   { -1, NULL},
 };
 
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index eabdb8556..66c89b5a9 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -17,6 +17,7 @@
 #include "vpx_version.h"
 #include "decoder/vp9_onyxd.h"
 #include "decoder/vp9_onyxd_int.h"
+#include "vp9/vp9_iface_common.h"
 
 #define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
 typedef vpx_codec_stream_info_t  vp8_stream_info_t;
@@ -273,36 +274,6 @@ update_error_state(vpx_codec_alg_priv_t                 *ctx,
   return res;
 }
 
-static void yuvconfig2image(vpx_image_t               *img,
-                            const YV12_BUFFER_CONFIG  *yv12,
-                            void                      *user_priv) {
-  /** vpx_img_wrap() doesn't allow specifying independent strides for
-    * the Y, U, and V planes, nor other alignment adjustments that
-    * might be representable by a YV12_BUFFER_CONFIG, so we just
-    * initialize all the fields.*/
-  img->fmt = yv12->clrtype == REG_YUV ?
-             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
-  img->w = yv12->y_stride;
-  img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15;
-  img->d_w = yv12->y_width;
-  img->d_h = yv12->y_height;
-  img->x_chroma_shift = 1;
-  img->y_chroma_shift = 1;
-  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
-  img->planes[VPX_PLANE_U] = yv12->u_buffer;
-  img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
-  img->stride[VPX_PLANE_Y] = yv12->y_stride;
-  img->stride[VPX_PLANE_U] = yv12->uv_stride;
-  img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-  img->bps = 12;
-  img->user_priv = user_priv;
-  img->img_data = yv12->buffer_alloc;
-  img->img_data_owner = 0;
-  img->self_allocd = 0;
-}
-
 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
                                   const uint8_t        **data,
                                   unsigned int           data_sz,
@@ -613,6 +584,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
   yv12->u_buffer = img->planes[VPX_PLANE_U];
   yv12->v_buffer = img->planes[VPX_PLANE_V];
 
+  yv12->y_crop_width  = img->d_w;
+  yv12->y_crop_height = img->d_h;
   yv12->y_width  = img->d_w;
   yv12->y_height = img->d_h;
   yv12->uv_width = yv12->y_width / 2;
@@ -648,9 +621,9 @@ static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
 
 }
 
-static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
-                                         int ctr_id,
-                                         va_list args) {
+static vpx_codec_err_t vp9_copy_reference(vpx_codec_alg_priv_t *ctx,
+                                          int ctr_id,
+                                          va_list args) {
 
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
 
@@ -660,13 +633,29 @@ static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,
 
     image2yuvconfig(&frame->img, &sd);
 
-    return vp9_get_reference_dec(ctx->pbi,
-                                 (VP9_REFFRAME)frame->frame_type, &sd);
+    return vp9_copy_reference_dec(ctx->pbi,
+                                  (VP9_REFFRAME)frame->frame_type, &sd);
   } else
     return VPX_CODEC_INVALID_PARAM;
 
 }
 
+static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
+                                     int ctr_id,
+                                     va_list args) {
+  vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
+
+  if (data) {
+    YV12_BUFFER_CONFIG* fb;
+
+    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
+    yuvconfig2image(&data->img, fb, NULL);
+    return VPX_CODEC_OK;
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
 static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
                                         int ctr_id,
                                         va_list args) {
@@ -739,7 +728,7 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
 
 static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
   {VP8_SET_REFERENCE,             vp9_set_reference},
-  {VP8_COPY_REFERENCE,            vp9_get_reference},
+  {VP8_COPY_REFERENCE,            vp9_copy_reference},
   {VP8_SET_POSTPROC,              vp8_set_postproc},
   {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},
   {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},
@@ -747,6 +736,7 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
   {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},
   {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},
   {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},
+  {VP9_GET_REFERENCE,             get_reference},
   { -1, NULL},
 };
 
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
new file mode 100644
index 000000000..450be7dfd
--- /dev/null
+++ b/vp9/vp9_iface_common.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_VP9_IFACE_COMMON_H_
+#define VP9_VP9_IFACE_COMMON_H_
+
+static void yuvconfig2image(vpx_image_t               *img,
+                            const YV12_BUFFER_CONFIG  *yv12,
+                            void                      *user_priv) {
+  /** vpx_img_wrap() doesn't allow specifying independent strides for
+    * the Y, U, and V planes, nor other alignment adjustments that
+    * might be representable by a YV12_BUFFER_CONFIG, so we just
+    * initialize all the fields.*/
+  img->fmt = yv12->clrtype == REG_YUV ?
+             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+  img->w = yv12->y_stride;
+  img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15;
+  img->d_w = yv12->y_width;
+  img->d_h = yv12->y_height;
+  img->x_chroma_shift = 1;
+  img->y_chroma_shift = 1;
+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+  img->planes[VPX_PLANE_U] = yv12->u_buffer;
+  img->planes[VPX_PLANE_V] = yv12->v_buffer;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->stride[VPX_PLANE_Y] = yv12->y_stride;
+  img->stride[VPX_PLANE_U] = yv12->uv_stride;
+  img->stride[VPX_PLANE_V] = yv12->uv_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+  img->bps = 12;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+}
+
+#endif
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 3c313632b..0b4cb1b9e 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -44,6 +44,12 @@ enum vp8_com_control_id {
   VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
   VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */
   VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
+
+  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
+   * for its control ids. These should be migrated to something like the
+   * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
+   */
+  VP9_GET_REFERENCE           = 128,  /**< get a pointer to a reference frame */
   VP8_COMMON_CTRL_ID_MAX,
   VP8_DECODER_CTRL_ID_START   = 256
 };
@@ -97,6 +103,10 @@ typedef struct vpx_ref_frame {
   vpx_image_t           img;          /**< reference frame data in image format */
 } vpx_ref_frame_t;
 
+typedef struct vp9_ref_frame {
+  int idx; /**< frame index to get (input) */
+  vpx_image_t  img; /**< img structure to populate (output) */
+} vp9_ref_frame_t;
 
 /*!\brief vp8 decoder control function parameter type
  *
@@ -110,6 +120,7 @@ VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_REF_FRAME, int)
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)
 VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)
+VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
 
 
 /*! @} - end defgroup vp8 */
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 267d55f40..fc7f82881 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -38,10 +38,12 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
 int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                   int width, int height, int border) {
   if (ybf) {
-    int y_stride = ((width + 2 * border) + 31) & ~31;
-    int yplane_size = (height + 2 * border) * y_stride;
-    int uv_width = width >> 1;
-    int uv_height = height >> 1;
+    int aligned_width = (width + 15) & ~15;
+    int aligned_height = (height + 15) & ~15;
+    int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    int yplane_size = (aligned_height + 2 * border) * y_stride;
+    int uv_width = aligned_width >> 1;
+    int uv_height = aligned_height >> 1;
     /** There is currently a bunch of code which assumes
       *  uv_stride == y_stride/2, so enforce this here. */
     int uv_stride = y_stride >> 1;
@@ -56,17 +58,18 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
     if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
       return -1;
 
-    /** Only support allocating buffers that have a height and width that
-      *  are multiples of 16, and a border that's a multiple of 32.
-      * The border restriction is required to get 16-byte alignment of the
-      *  start of the chroma rows without intoducing an arbitrary gap
-      *  between planes, which would break the semantics of things like
-      *  vpx_img_set_rect(). */
-    if ((width & 0xf) | (height & 0xf) | (border & 0x1f))
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without intoducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect(). */
+    if (border & 0x1f)
       return -3;
 
-    ybf->y_width  = width;
-    ybf->y_height = height;
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
     ybf->y_stride = y_stride;
 
     ybf->uv_width = uv_width;
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index d733bd49d..49d7e8e56 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -20,180 +20,81 @@
 /****************************************************************************
  *
  ****************************************************************************/
-void
-vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+static void extend_plane(uint8_t *s,       /* source */
+                         int sp,           /* source pitch */
+                         int w,            /* width */
+                         int h,            /* height */
+                         int et,           /* extend top border */
+                         int el,           /* extend left border */
+                         int eb,           /* extend bottom border */
+                         int er) {         /* extend right border */
   int i;
-  unsigned char *src_ptr1, *src_ptr2;
-  unsigned char *dest_ptr1, *dest_ptr2;
-
-  unsigned int Border;
-  int plane_stride;
-  int plane_height;
-  int plane_width;
-
-  /***********/
-  /* Y Plane */
-  /***********/
-  Border = ybf->border;
-  plane_stride = ybf->y_stride;
-  plane_height = ybf->y_height;
-  plane_width = ybf->y_width;
+  uint8_t *src_ptr1, *src_ptr2;
+  uint8_t *dest_ptr1, *dest_ptr2;
+  int linesize;
 
   /* copy the left and right most columns out */
-  src_ptr1 = ybf->y_buffer;
-  src_ptr2 = src_ptr1 + plane_width - 1;
-  dest_ptr1 = src_ptr1 - Border;
-  dest_ptr2 = src_ptr2 + 1;
-
-  for (i = 0; i < plane_height; i++) {
-    vpx_memset(dest_ptr1, src_ptr1[0], Border);
-    vpx_memset(dest_ptr2, src_ptr2[0], Border);
-    src_ptr1  += plane_stride;
-    src_ptr2  += plane_stride;
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
+  src_ptr1 = s;
+  src_ptr2 = s + w - 1;
+  dest_ptr1 = s - el;
+  dest_ptr2 = s + w;
+
+  for (i = 0; i < h; i++) {
+    vpx_memset(dest_ptr1, src_ptr1[0], el);
+    vpx_memset(dest_ptr2, src_ptr2[0], er);
+    src_ptr1  += sp;
+    src_ptr2  += sp;
+    dest_ptr1 += sp;
+    dest_ptr2 += sp;
   }
 
-  /* Now copy the top and bottom source lines into each line of the respective borders */
-  src_ptr1 = ybf->y_buffer - Border;
-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-  dest_ptr1 = src_ptr1 - (Border * plane_stride);
-  dest_ptr2 = src_ptr2 + plane_stride;
-
-  for (i = 0; i < (int)Border; i++) {
-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = s - el;
+  src_ptr2 = s + sp * (h - 1) - el;
+  dest_ptr1 = s + sp * (-et) - el;
+  dest_ptr2 = s + sp * (h) - el;
+  linesize = el + er + w;
+
+  for (i = 0; i < et; i++) {
+    vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+    dest_ptr1 += sp;
   }
 
-
-  /***********/
-  /* U Plane */
-  /***********/
-  plane_stride = ybf->uv_stride;
-  plane_height = ybf->uv_height;
-  plane_width = ybf->uv_width;
-  Border /= 2;
-
-  /* copy the left and right most columns out */
-  src_ptr1 = ybf->u_buffer;
-  src_ptr2 = src_ptr1 + plane_width - 1;
-  dest_ptr1 = src_ptr1 - Border;
-  dest_ptr2 = src_ptr2 + 1;
-
-  for (i = 0; i < plane_height; i++) {
-    vpx_memset(dest_ptr1, src_ptr1[0], Border);
-    vpx_memset(dest_ptr2, src_ptr2[0], Border);
-    src_ptr1  += plane_stride;
-    src_ptr2  += plane_stride;
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
-  }
-
-  /* Now copy the top and bottom source lines into each line of the respective borders */
-  src_ptr1 = ybf->u_buffer - Border;
-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-  dest_ptr1 = src_ptr1 - (Border * plane_stride);
-  dest_ptr2 = src_ptr2 + plane_stride;
-
-  for (i = 0; i < (int)(Border); i++) {
-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
-  }
-
-  /***********/
-  /* V Plane */
-  /***********/
-
-  /* copy the left and right most columns out */
-  src_ptr1 = ybf->v_buffer;
-  src_ptr2 = src_ptr1 + plane_width - 1;
-  dest_ptr1 = src_ptr1 - Border;
-  dest_ptr2 = src_ptr2 + 1;
-
-  for (i = 0; i < plane_height; i++) {
-    vpx_memset(dest_ptr1, src_ptr1[0], Border);
-    vpx_memset(dest_ptr2, src_ptr2[0], Border);
-    src_ptr1  += plane_stride;
-    src_ptr2  += plane_stride;
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
-  }
-
-  /* Now copy the top and bottom source lines into each line of the respective borders */
-  src_ptr1 = ybf->v_buffer - Border;
-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-  dest_ptr1 = src_ptr1 - (Border * plane_stride);
-  dest_ptr2 = src_ptr2 + plane_stride;
-
-  for (i = 0; i < (int)(Border); i++) {
-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
+  for (i = 0; i < eb; i++) {
+    vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+    dest_ptr2 += sp;
   }
 }
 
-
-static void
-extend_frame_borders_yonly_c(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-  unsigned char *src_ptr1, *src_ptr2;
-  unsigned char *dest_ptr1, *dest_ptr2;
-
-  unsigned int Border;
-  int plane_stride;
-  int plane_height;
-  int plane_width;
-
-  /***********/
-  /* Y Plane */
-  /***********/
-  Border = ybf->border;
-  plane_stride = ybf->y_stride;
-  plane_height = ybf->y_height;
-  plane_width = ybf->y_width;
-
-  /* copy the left and right most columns out */
-  src_ptr1 = ybf->y_buffer;
-  src_ptr2 = src_ptr1 + plane_width - 1;
-  dest_ptr1 = src_ptr1 - Border;
-  dest_ptr2 = src_ptr2 + 1;
-
-  for (i = 0; i < plane_height; i++) {
-    vpx_memset(dest_ptr1, src_ptr1[0], Border);
-    vpx_memset(dest_ptr2, src_ptr2[0], Border);
-    src_ptr1  += plane_stride;
-    src_ptr2  += plane_stride;
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
-  }
-
-  /* Now copy the top and bottom source lines into each line of the respective borders */
-  src_ptr1 = ybf->y_buffer - Border;
-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-  dest_ptr1 = src_ptr1 - (Border * plane_stride);
-  dest_ptr2 = src_ptr2 + plane_stride;
-
-  for (i = 0; i < (int)Border; i++) {
-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-    dest_ptr1 += plane_stride;
-    dest_ptr2 += plane_stride;
-  }
-
-  plane_stride /= 2;
-  plane_height /= 2;
-  plane_width /= 2;
-  Border /= 2;
-
+void
+vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ybf->border, ybf->border,
+               ybf->border + ybf->y_height - ybf->y_crop_height,
+               ybf->border + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride,
+               (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+               ybf->border / 2, ybf->border / 2,
+               (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+               (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride,
+               (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+               ybf->border / 2, ybf->border / 2,
+               (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+               (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
 }
 
 
-
 /****************************************************************************
  *
  *  ROUTINE       : vp8_yv12_copy_frame
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 45e57f401..14b6e278b 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -42,6 +42,8 @@ extern "C" {
   typedef struct yv12_buffer_config {
     int   y_width;
     int   y_height;
+    int   y_crop_width;
+    int   y_crop_height;
     int   y_stride;
     /*    int   yinternal_width; */
 
diff --git a/vpxdec.c b/vpxdec.c
index 30196ecc8..287e796ae 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -711,7 +711,7 @@ int main(int argc, const char **argv_) {
   struct input_ctx        input = {0};
   int                     frames_corrupted = 0;
   int                     dec_flags = 0;
-  int                     do_scale;
+  int                     do_scale = 0;
   int                     stream_w = 0, stream_h = 0;
   vpx_image_t             *scaled_img = NULL;
 
diff --git a/vpxenc.c b/vpxenc.c
index 19e10820c..e915efdfa 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1645,8 +1645,6 @@ struct stream_state {
   stats_io_t                stats;
   struct vpx_image         *img;
   vpx_codec_ctx_t           decoder;
-  vpx_ref_frame_t           ref_enc;
-  vpx_ref_frame_t           ref_dec;
   int                       mismatch_seen;
 };
 
@@ -2235,16 +2233,7 @@ static void initialize_encoder(struct stream_state  *stream,
 
 #if CONFIG_DECODERS
   if (global->test_decode != TEST_DECODE_OFF) {
-    int width, height;
-
     vpx_codec_dec_init(&stream->decoder, global->codec->dx_iface(), NULL, 0);
-
-    width = (stream->config.cfg.g_w + 15) & ~15;
-    height = (stream->config.cfg.g_h + 15) & ~15;
-    vpx_img_alloc(&stream->ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);
-    vpx_img_alloc(&stream->ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);
-    stream->ref_enc.frame_type = VP8_LAST_FRAME;
-    stream->ref_dec.frame_type = VP8_LAST_FRAME;
   }
 #endif
 }
@@ -2429,19 +2418,44 @@ static float usec_to_fps(uint64_t usec, unsigned int frames) {
 
 
 static void test_decode(struct stream_state  *stream,
-                        enum TestDecodeFatality fatal) {
+                        enum TestDecodeFatality fatal,
+                        const struct codec_item *codec) {
+  vpx_image_t enc_img, dec_img;
+
   if (stream->mismatch_seen)
     return;
 
-  vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &stream->ref_enc);
+  /* Get the internal reference frame */
+  if (codec->fourcc == VP8_FOURCC) {
+    struct vpx_ref_frame ref_enc, ref_dec;
+    int width, height;
+
+    width = (stream->config.cfg.g_w + 15) & ~15;
+    height = (stream->config.cfg.g_h + 15) & ~15;
+    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);
+    enc_img = ref_enc.img;
+    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);
+    dec_img = ref_dec.img;
+
+    ref_enc.frame_type = VP8_LAST_FRAME;
+    ref_dec.frame_type = VP8_LAST_FRAME;
+    vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
+    vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
+  } else {
+    struct vp9_ref_frame ref;
+
+    ref.idx = 0;
+    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref);
+    enc_img = ref.img;
+    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref);
+    dec_img = ref.img;
+  }
   ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
-  vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &stream->ref_dec);
   ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
 
-  if (!compare_img(&stream->ref_enc.img, &stream->ref_dec.img)) {
+  if (!compare_img(&enc_img, &dec_img)) {
     int y[2], u[2], v[2];
-    find_mismatch(&stream->ref_enc.img, &stream->ref_dec.img,
-                  y, u, v);
+    find_mismatch(&enc_img, &dec_img, y, u, v);
     stream->decoder.err = 1;
     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
                           "Stream %d: Encode/decode mismatch on frame %d"
@@ -2450,6 +2464,9 @@ static void test_decode(struct stream_state  *stream,
                           y[0], y[1], u[0], u[1], v[0], v[1]);
     stream->mismatch_seen = stream->frames_out;
   }
+
+  vpx_img_free(&enc_img);
+  vpx_img_free(&dec_img);
 }
 
 
@@ -2671,7 +2688,7 @@ int main(int argc, const char **argv_) {
         }
 
         if (got_data && global.test_decode != TEST_DECODE_OFF)
-          FOREACH_STREAM(test_decode(stream, global.test_decode));
+          FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec));
       }
 
       fflush(stdout);
@@ -2703,8 +2720,6 @@ int main(int argc, const char **argv_) {
 
     if (global.test_decode != TEST_DECODE_OFF) {
       FOREACH_STREAM(vpx_codec_destroy(&stream->decoder));
-      FOREACH_STREAM(vpx_img_free(&stream->ref_enc.img));
-      FOREACH_STREAM(vpx_img_free(&stream->ref_dec.img));
     }
 
     close_input_file(&input);