23 files changed, 424 insertions, 368 deletions
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 8b04d1b43..e209788c3 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -141,68 +141,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
   return 1;
 }
 
-static void init_frame_bufs(VP9_COMMON *cm) {
-  BufferPool *const pool = cm->buffer_pool;
-  int i;
-
-  cm->new_fb_idx = FRAME_BUFFERS - 1;
-  pool->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
-  for (i = 0; i < REF_FRAMES; ++i) {
-    cm->ref_frame_map[i] = i;
-    pool->frame_bufs[i].ref_count = 1;
-  }
-}
-
-int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
-  int i;
-  const int ss_x = cm->subsampling_x;
-  const int ss_y = cm->subsampling_y;
-
-  vp9_free_ref_frame_buffers(cm);
-
-  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    BufferPool *const pool = cm->buffer_pool;
-    pool->frame_bufs[i].ref_count = 0;
-    if (vp9_alloc_frame_buffer(&pool->frame_bufs[i].buf, width, height,
-                               ss_x, ss_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                               cm->use_highbitdepth,
-#endif
-                               VP9_ENC_BORDER_IN_PIXELS,
-                               cm->byte_alignment) < 0)
-      goto fail;
-    if (pool->frame_bufs[i].mvs == NULL) {
-      pool->frame_bufs[i].mvs =
-          (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
-                               sizeof(*pool->frame_bufs[i].mvs));
-      if (pool->frame_bufs[i].mvs == NULL)
-        goto fail;
-
-      pool->frame_bufs[i].mi_rows = cm->mi_rows;
-      pool->frame_bufs[i].mi_cols = cm->mi_cols;
-    }
-  }
-
-  init_frame_bufs(cm);
-
-#if CONFIG_VP9_POSTPROC
-  if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                             cm->use_highbitdepth,
-#endif
-                             VP9_ENC_BORDER_IN_PIXELS,
-                             cm->byte_alignment) < 0)
-    goto fail;
-#endif
-
-  return 0;
-
- fail:
-  vp9_free_ref_frame_buffers(cm);
-  return 1;
-}
-
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_ref_frame_buffers(cm);
   vp9_free_context_buffers(cm);
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index 09da74e49..d82397fa3 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -12,6 +12,8 @@
 #ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_
 #define VP9_COMMON_VP9_ALLOCCOMMON_H_
 
+#define INVALID_IDX -1  // Invalid buffer index.
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -24,7 +26,6 @@ int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
 void vp9_init_context_buffers(struct VP9Common *cm);
 void vp9_free_context_buffers(struct VP9Common *cm);
 
-int vp9_alloc_ref_frame_buffers(struct VP9Common *cm, int width, int height);
 void vp9_free_ref_frame_buffers(struct VP9Common *cm);
 
 int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index e614e6da8..1668b99ce 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -12,6 +12,7 @@
 #include "./vp9_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
 
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
@@ -579,7 +580,7 @@ static intra_high_pred_fn pred_high[INTRA_MODES][4];
 static intra_high_pred_fn dc_pred_high[2][2][4];
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-void vp9_init_intra_predictors() {
+static void vp9_init_intra_predictors_internal(void) {
 #define INIT_ALL_SIZES(p, type) \
   p[TX_4X4] = vp9_##type##_predictor_4x4; \
   p[TX_8X8] = vp9_##type##_predictor_8x8; \
@@ -894,3 +895,7 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
   build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
                          have_top, have_left, have_right, x, y, plane);
 }
+
+void vp9_init_intra_predictors() {
+  once(vp9_init_intra_predictors_internal);
+}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index d345a0578..c183cf38e 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -934,7 +934,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
     winterface->sync(&pbi->lf_worker);
     vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
                                pbi->mb.plane);
-    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
   }
 
   assert(tile_rows <= 4);
@@ -1362,7 +1361,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
     pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
 
     for (i = 0; i < REFS_PER_FRAME; ++i) {
-      cm->frame_refs[i].idx = -1;
+      cm->frame_refs[i].idx = INVALID_IDX;
       cm->frame_refs[i].buf = NULL;
     }
 
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index 50c8bca0b..90d113c32 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -32,12 +32,13 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
 void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
                        const int ref_stride, const int height) {
   int idx;
+  const int norm_factor = MAX(8, height >> 1);
   for (idx = 0; idx < 16; ++idx) {
     int i;
     hbuf[idx] = 0;
     for (i = 0; i < height; ++i)
       hbuf[idx] += ref[i * ref_stride];
-    hbuf[idx] /= 32;
+    hbuf[idx] /= norm_factor;
     ++ref;
   }
 }
@@ -45,9 +46,10 @@ void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
 int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
   int idx;
   int16_t sum = 0;
+  const int norm_factor = MAX(8, width >> 1);
   for (idx = 0; idx < width; ++idx)
     sum += ref[idx];
-  return sum / 32;
+  return sum / norm_factor;
 }
 
 int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3a70364ae..b24fe2950 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -989,8 +989,6 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
-    found = cm->width == cfg->y_crop_width &&
-            cm->height == cfg->y_crop_height;
 
     // Set "found" to 0 for temporal svc and for spatial svc key frame
     if (cpi->use_svc &&
@@ -1003,6 +1001,9 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
          cpi->svc.layer_context[0].frames_from_key_frame <
          cpi->svc.number_temporal_layers + 1))) {
       found = 0;
+    } else if (cfg != NULL) {
+      found = cm->width == cfg->y_crop_width &&
+              cm->height == cfg->y_crop_height;
     }
     vp9_wb_write_bit(wb, found);
     if (found) {
@@ -1114,7 +1115,8 @@ static void write_uncompressed_header(VP9_COMP *cpi,
       MV_REFERENCE_FRAME ref_frame;
       vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-        vp9_wb_write_literal(wb, get_ref_frame_idx(cpi, ref_frame),
+        assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+        vp9_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
                              REF_FRAMES_LOG2);
         vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
       }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 63ca2d34d..a86981a71 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -144,12 +144,14 @@ static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
                                                    const struct buf_2d *ref,
                                                    int mi_row, int mi_col,
                                                    BLOCK_SIZE bs) {
+  unsigned int sse, var;
+  uint8_t *last_y;
   const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
-  const uint8_t* last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride +
-                                              mi_col * MI_SIZE];
-  unsigned int sse;
-  const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                                              last_y, last->y_stride, &sse);
+
+  assert(last != NULL);
+  last_y =
+      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
@@ -518,156 +520,8 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
 #define GLOBAL_MOTION 1
 #endif
 
-#if GLOBAL_MOTION
-static int vector_match(int16_t *ref, int16_t *src, int bwl) {
-  int best_sad = INT_MAX;
-  int this_sad;
-  int d;
-  int center, offset = 0;
-  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
-  for (d = 0; d <= bw; d += 16) {
-    this_sad = vp9_vector_var(&ref[d], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      offset = d;
-    }
-  }
-  center = offset;
-
-  for (d = -8; d <= 8; d += 16) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw)
-      continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -4; d <= 4; d += 8) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw)
-      continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -2; d <= 2; d += 4) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw)
-      continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-  offset = center;
-
-  for (d = -1; d <= 1; d += 2) {
-    int this_pos = offset + d;
-    // check limit
-    if (this_pos < 0 || this_pos > bw)
-      continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      center = this_pos;
-    }
-  }
-
-  return (center - (bw >> 1));
-}
-
-static const MV search_pos[9] = {
-  {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 0}, {0, 1},
-  {1, -1}, {1, 0}, {1, 1},
-};
-
-static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x,
-                              BLOCK_SIZE bsize) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
-  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
-  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
-  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
-  int idx;
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
-  const int search_width = bw << 1;
-  const int search_height = bh << 1;
-  const int src_stride = x->plane[0].src.stride;
-  const int ref_stride = xd->plane[0].pre[0].stride;
-  uint8_t const *ref_buf, *src_buf;
-  MV *tmp_mv = &xd->mi[0].src_mi->mbmi.mv[0].as_mv;
-  int best_sad;
-  MV this_mv;
-
-  // Set up prediction 1-D reference set
-  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
-  for (idx = 0; idx < search_width; idx += 16) {
-    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
-    ref_buf += 16;
-  }
-
-  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
-  for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp9_int_pro_col(ref_buf, bw);
-    ref_buf += ref_stride;
-  }
-
-  // Set up src 1-D reference set
-  for (idx = 0; idx < bw; idx += 16) {
-    src_buf = x->plane[0].src.buf + idx;
-    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
-  }
-
-  src_buf = x->plane[0].src.buf;
-  for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw);
-    src_buf += src_stride;
-  }
-
-  // Find the best match per 1-D search
-  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
-  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
-
-  best_sad = INT_MAX;
-  this_mv = *tmp_mv;
-  for (idx = 0; idx < 9; ++idx) {
-    int this_sad;
-    src_buf = x->plane[0].src.buf;
-    ref_buf = xd->plane[0].pre[0].buf +
-        (search_pos[idx].row + this_mv.row) * ref_stride +
-        (search_pos[idx].col + this_mv.col);
-
-    this_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
-                                      ref_buf, ref_stride);
-    if (this_sad < best_sad) {
-      best_sad = this_sad;
-      tmp_mv->row = search_pos[idx].row + this_mv.row;
-      tmp_mv->col = search_pos[idx].col + this_mv.col;
-    }
-  }
-
-  tmp_mv->row *= 8;
-  tmp_mv->col *= 8;
-
-  x->pred_mv[LAST_FRAME] = *tmp_mv;
-}
-#endif
-
 // This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for downs-sampled inputs.
+// reconstructed last, where variance is computed for down-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
                                 MACROBLOCK *x,
@@ -682,7 +536,6 @@ static void choose_partitioning(VP9_COMP *cpi,
   int sp;
   int dp;
   int pixels_wide = 64, pixels_high = 64;
-  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
 
   // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
@@ -709,7 +562,13 @@ static void choose_partitioning(VP9_COMP *cpi,
 
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
-    unsigned int var = 0, sse;
+    unsigned int uv_sad;
+#if GLOBAL_MOTION
+    unsigned int y_sad;
+    BLOCK_SIZE bsize;
+#endif
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+    assert(yv12 != NULL);
     vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
         &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
@@ -719,7 +578,16 @@ static void choose_partitioning(VP9_COMP *cpi,
     mbmi->interp_filter = BILINEAR;
 
 #if GLOBAL_MOTION
-    motion_estimation(cpi, x, BLOCK_64X64);
+    if (mi_row + 4 < cm->mi_rows && mi_col + 4 < cm->mi_cols)
+      bsize = BLOCK_64X64;
+    else if (mi_row + 4 < cm->mi_rows && mi_col + 4 >= cm->mi_cols)
+      bsize = BLOCK_32X64;
+    else if (mi_row + 4 >= cm->mi_rows && mi_col + 4 < cm->mi_cols)
+      bsize = BLOCK_64X32;
+    else
+      bsize = BLOCK_32X32;
+
+    y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
 #endif
 
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
@@ -728,10 +596,14 @@ static void choose_partitioning(VP9_COMP *cpi,
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
       const BLOCK_SIZE bs = get_plane_block_size(BLOCK_64X64, pd);
-      var += cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                                pd->dst.buf, pd->dst.stride, &sse);
-      if (sse > 2048)
-        x->color_sensitivity[i - 1] = 1;
+      uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+                                   pd->dst.buf, pd->dst.stride);
+
+#if GLOBAL_MOTION
+      x->color_sensitivity[i - 1] = uv_sad * 4 > y_sad;
+#else
+      x->color_sensitivity[i - 1] = (uv_sad > 512);
+#endif
     }
 
     d = xd->plane[0].dst.buf;
@@ -3897,7 +3769,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
   // Special case: set prev_mi to NULL when the previous mode info
   // context cannot be used.
   cm->prev_mi = cm->use_prev_frame_mvs ?
-                  cm->prev_mip + cm->mi_stride + 1 : NULL;
+                cm->prev_mip + cm->mi_stride + 1 : NULL;
 
   x->quant_fp = cpi->sf.use_quant_fp;
   vp9_zero(x->skip_txfm);
@@ -4169,6 +4041,7 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
                                                      mbmi->ref_frame[ref]);
+      assert(cfg != NULL);
       vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
                            &xd->block_refs[ref]->sf);
     }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 70b804e31..65e299793 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -476,19 +476,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
         break;
       case TX_16X16:
         vp9_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+        vp9_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
                                p->quant_fp[0], qcoeff, dqcoeff,
                                pd->dequant[0], eob);
         break;
       case TX_8X8:
         vp9_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+        vp9_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
                                p->quant_fp[0], qcoeff, dqcoeff,
                                pd->dequant[0], eob);
         break;
       case TX_4X4:
         x->fwd_txm4x4(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+        vp9_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
                                p->quant_fp[0], qcoeff, dqcoeff,
                                pd->dequant[0], eob);
         break;
@@ -508,19 +508,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
       break;
     case TX_16X16:
       vp9_fdct16x16_1(src_diff, coeff, diff_stride);
-      vp9_quantize_dc(coeff, x->skip_block, p->round,
+      vp9_quantize_dc(coeff, 256, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
       break;
     case TX_8X8:
       vp9_fdct8x8_1(src_diff, coeff, diff_stride);
-      vp9_quantize_dc(coeff, x->skip_block, p->round,
+      vp9_quantize_dc(coeff, 64, x->skip_block, p->round,
                       p->quant_fp[0], qcoeff, dqcoeff,
                       pd->dequant[0], eob);
       break;
     case TX_4X4:
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
-      vp9_quantize_dc(coeff, x->skip_block, p->round,
+      vp9_quantize_dc(coeff, 16, x->skip_block, p->round,
                       p->quant_fp[0], qcoeff, dqcoeff,
                       pd->dequant[0], eob);
       break;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index bd7e0b64c..249c2363d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -483,6 +483,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
+  // TODO(agrange) Check if ARF is enabled and skip allocation if not.
   if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
                                oxcf->width, oxcf->height,
                                cm->subsampling_x, cm->subsampling_y,
@@ -495,13 +496,6 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
                        "Failed to allocate altref buffer");
 }
 
-static void alloc_ref_frame_buffers(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  if (vp9_alloc_ref_frame_buffers(cm, cm->width, cm->height))
-    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate frame buffers");
-}
-
 static void alloc_util_frame_buffers(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
@@ -2483,6 +2477,21 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
   vp9_extend_frame_inner_borders(cm->frame_to_show);
 }
 
+static INLINE void alloc_frame_mvs(const VP9_COMMON *cm,
+                                   int buffer_idx) {
+  RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+  if (new_fb_ptr->mvs == NULL ||
+      new_fb_ptr->mi_rows < cm->mi_rows ||
+      new_fb_ptr->mi_cols < cm->mi_cols) {
+    vpx_free(new_fb_ptr->mvs);
+    new_fb_ptr->mvs =
+      (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+                           sizeof(*new_fb_ptr->mvs));
+    new_fb_ptr->mi_rows = cm->mi_rows;
+    new_fb_ptr->mi_cols = cm->mi_cols;
+  }
+}
+
 void vp9_scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
@@ -2491,13 +2500,19 @@ void vp9_scale_references(VP9_COMP *cpi) {
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
     if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
-      const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-       BufferPool *const pool = cm->buffer_pool;
-      const YV12_BUFFER_CONFIG *const ref = &pool->frame_bufs[idx].buf;
+      BufferPool *const pool = cm->buffer_pool;
+      const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+                                                                 ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+        continue;
+      }
 
 #if CONFIG_VP9_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         const int new_fb = get_free_fb(cm);
+        RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
         cm->cur_frame = &pool->frame_bufs[new_fb];
         vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf,
                                  cm->width, cm->height,
@@ -2505,35 +2520,28 @@ void vp9_scale_references(VP9_COMP *cpi) {
                                  cm->use_highbitdepth,
                                  VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
                                  NULL, NULL, NULL);
-        scale_and_extend_frame(ref, &pool->frame_bufs[new_fb].buf,
-                               (int)cm->bit_depth);
+        scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
 #else
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         const int new_fb = get_free_fb(cm);
-        vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf,
+        RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+        vp9_realloc_frame_buffer(&new_fb_ptr->buf,
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
                                  VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
                                  NULL, NULL, NULL);
-        scale_and_extend_frame(ref, &pool->frame_bufs[new_fb].buf);
+        scale_and_extend_frame(ref, &new_fb_ptr->buf);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
-        if (pool->frame_bufs[new_fb].mvs == NULL ||
-            pool->frame_bufs[new_fb].mi_rows < cm->mi_rows ||
-            pool->frame_bufs[new_fb].mi_cols < cm->mi_cols) {
-          vpx_free(pool->frame_bufs[new_fb].mvs);
-          pool->frame_bufs[new_fb].mvs =
-            (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
-                                 sizeof(*pool->frame_bufs[new_fb].mvs));
-          pool->frame_bufs[new_fb].mi_rows = cm->mi_rows;
-          pool->frame_bufs[new_fb].mi_cols = cm->mi_cols;
-        }
+
+        alloc_frame_mvs(cm, new_fb);
       } else {
-        cpi->scaled_ref_idx[ref_frame - 1] = idx;
-        ++pool->frame_bufs[idx].ref_count;
+        const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+        cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+        ++pool->frame_bufs[buf_idx].ref_count;
       }
     } else {
-      cpi->scaled_ref_idx[ref_frame - 1] = INVALID_REF_BUFFER_IDX;
+      cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
     }
   }
 }
@@ -2543,11 +2551,11 @@ static void release_scaled_references(VP9_COMP *cpi) {
   int i;
   for (i = 0; i < MAX_REF_FRAMES; ++i) {
     const int idx = cpi->scaled_ref_idx[i];
-    RefCntBuffer *const buf = idx != INVALID_REF_BUFFER_IDX ?
+    RefCntBuffer *const buf = idx != INVALID_IDX ?
         &cm->buffer_pool->frame_bufs[idx] : NULL;
     if (buf != NULL) {
       --buf->ref_count;
-      cpi->scaled_ref_idx[i] = INVALID_REF_BUFFER_IDX;
+      cpi->scaled_ref_idx[i] = INVALID_IDX;
     }
   }
 }
@@ -2751,6 +2759,8 @@ void set_frame_size(VP9_COMP *cpi) {
     vp9_set_target_rate(cpi);
   }
 
+  alloc_frame_mvs(cm, cm->new_fb_idx);
+
   // Reset the frame pointers to the current frame size.
   vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
                            cm->width, cm->height,
@@ -2765,24 +2775,30 @@ void set_frame_size(VP9_COMP *cpi) {
   init_motion_estimation(cpi);
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[idx].buf;
     RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
-    ref_buf->buf = buf;
-    ref_buf->idx = idx;
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+    ref_buf->idx = buf_idx;
+
+    if (buf_idx != INVALID_IDX) {
+      YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+      ref_buf->buf = buf;
 #if CONFIG_VP9_HIGHBITDEPTH
-    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
-                                      buf->y_crop_width, buf->y_crop_height,
-                                      cm->width, cm->height,
-                                      (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
-                                          1 : 0);
+      vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height,
+                                        (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+                                            1 : 0);
 #else
-    vp9_setup_scale_factors_for_frame(&ref_buf->sf,
-                                      buf->y_crop_width, buf->y_crop_height,
-                                      cm->width, cm->height);
+      vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+                                        buf->y_crop_width, buf->y_crop_height,
+                                        cm->width, cm->height);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    if (vp9_is_scaled(&ref_buf->sf))
-      vp9_extend_frame_borders(buf);
+      if (vp9_is_scaled(&ref_buf->sf))
+        vp9_extend_frame_borders(buf);
+    } else {
+      ref_buf->buf = NULL;
+    }
   }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
@@ -3448,6 +3464,16 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
     vp9_twopass_postencode_update(cpi);
 }
 
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+  int i;
+  BufferPool *const pool = cm->buffer_pool;
+  cm->new_fb_idx = INVALID_IDX;
+  for (i = 0; i < REF_FRAMES; ++i) {
+    cm->ref_frame_map[i] = INVALID_IDX;
+    pool->frame_bufs[i].ref_count = 0;
+  }
+}
+
 static void check_initial_width(VP9_COMP *cpi,
 #if CONFIG_VP9_HIGHBITDEPTH
                                 int use_highbitdepth,
@@ -3468,7 +3494,7 @@ static void check_initial_width(VP9_COMP *cpi,
 #endif
 
     alloc_raw_frame_buffers(cpi);
-    alloc_ref_frame_buffers(cpi);
+    init_ref_frame_bufs(cm);
     alloc_util_frame_buffers(cpi);
 
     init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
@@ -3793,8 +3819,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
   // Find a free buffer for the new frame, releasing the reference previously
   // held.
-  pool->frame_bufs[cm->new_fb_idx].ref_count--;
+  if (cm->new_fb_idx != INVALID_IDX) {
+    --pool->frame_bufs[cm->new_fb_idx].ref_count;
+  }
   cm->new_fb_idx = get_free_fb(cm);
+
+  if (cm->new_fb_idx == INVALID_IDX)
+    return -1;
+
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
 
   if (!cpi->use_svc && cpi->multi_arf_allowed) {
@@ -3821,7 +3853,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 
   for (i = 0; i < MAX_REF_FRAMES; ++i)
-    cpi->scaled_ref_idx[i] = INVALID_REF_BUFFER_IDX;
+    cpi->scaled_ref_idx[i] = INVALID_IDX;
 
   if (oxcf->pass == 1 &&
       (!cpi->use_svc || is_two_pass_svc(cpi))) {
@@ -3907,8 +3939,18 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           PSNR_STATS psnr2;
           double frame_ssim2 = 0, weight = 0;
 #if CONFIG_VP9_POSTPROC
-          // TODO(agrange) Add resizing of post-proc buffer in here when the
-          // encoder is changed to use on-demand buffer allocation.
+          if (vp9_alloc_frame_buffer(&cm->post_proc_buffer,
+                                     recon->y_crop_width, recon->y_crop_height,
+                                     cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                     cm->use_highbitdepth,
+#endif
+                                     VP9_ENC_BORDER_IN_PIXELS,
+                                     cm->byte_alignment) < 0) {
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate post processing buffer");
+          }
+
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
                       cm->lf.filter_level * 10 / 6);
 #endif
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 69edfded1..c3679ca0b 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -17,6 +17,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
 
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_ppflags.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_thread_common.h"
@@ -47,7 +48,6 @@ extern "C" {
 #endif
 
 #define DEFAULT_GF_INTERVAL         10
-#define INVALID_REF_BUFFER_IDX      -1  // Marks an invalid reference buffer id.
 
 typedef struct {
   int nmvjointcost[MV_JOINTS];
@@ -517,8 +517,8 @@ static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
-static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
-                                    MV_REFERENCE_FRAME ref_frame) {
+static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
     return cpi->lst_fb_idx;
   } else if (ref_frame == GOLDEN_FRAME) {
@@ -528,12 +528,19 @@ static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
   }
 }
 
+static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
+                                        int ref_frame) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+  return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+}
+
 static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
     VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
   VP9_COMMON *const cm = &cpi->common;
-  BufferPool *const pool = cm->buffer_pool;
-  return &pool->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
-      .buf;
+  const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  return
+      buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
 }
 
 static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index b813e7de0..6c8bbdb6d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -66,12 +66,6 @@
 unsigned int arf_count = 0;
 #endif
 
-static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
-  YV12_BUFFER_CONFIG temp = *a;
-  *a = *b;
-  *b = temp;
-}
-
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
 static void reset_fpf_position(TWO_PASS *p,
@@ -465,12 +459,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int i;
 
   int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
-  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
-  int uv_mb_height = 16 >> (lst_yv12->y_height > lst_yv12->uv_height);
   int64_t intra_error = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
@@ -488,11 +476,22 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   MV lastmv = {0, 0};
   TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = {0, 0};
+  int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
   const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
   LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
         &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
   double intra_factor;
   double brightness_factor;
+  BufferPool *const pool = cm->buffer_pool;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
@@ -537,21 +536,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     }
 
     if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-      BufferPool *const pool = cm->buffer_pool;
-      const int ref_idx =
-          cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
-      const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
-
-      gld_yv12 = (scaled_idx != ref_idx) ? &pool->frame_bufs[scaled_idx].buf :
-                 get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+      gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      if (gld_yv12 == NULL) {
+        gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+      }
     } else {
       gld_yv12 = NULL;
     }
 
-    recon_y_stride = new_yv12->y_stride;
-    recon_uv_stride = new_yv12->uv_stride;
-    uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
-
     set_ref_ptrs(cm, xd,
                  (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
                  (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
@@ -563,9 +555,12 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
   vp9_setup_src_planes(x, cpi->Source, 0, 0);
-  vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
   vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
 
+  if (!frame_is_intra_only(cm)) {
+    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+  }
+
   xd->mi = cm->mi;
   xd->mi[0].src_mi = &xd->mi[0];
 
@@ -585,6 +580,10 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   // Tiling is ignored in the first pass.
   vp9_tile_init(&tile, cm, 0, 0);
 
+  recon_y_stride = new_yv12->y_stride;
+  recon_uv_stride = new_yv12->uv_stride;
+  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
   for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
     MV best_ref_mv = {0, 0};
 
@@ -1020,7 +1019,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
        ((twopass->this_frame_stats.intra_error /
          DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
-      vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+                 cm->ref_frame_map[cpi->lst_fb_idx]);
     }
     twopass->sr_update_lag = 1;
   } else {
@@ -1032,14 +1032,17 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   if (lc != NULL) {
     vp9_update_reference_frames(cpi);
   } else {
-    // Swap frame pointers so last frame refers to the frame we just compressed.
-    swap_yv12(lst_yv12, new_yv12);
+    // The frame we just compressed now becomes the last frame.
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+               cm->new_fb_idx);
   }
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
-  if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
-    vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+  if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
+      lc == NULL) {
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->ref_frame_map[cpi->lst_fb_idx]);
   }
 
   // Use this to see what the first pass reconstruction looks like.
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 18a8c72c4..b3a8df924 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -376,6 +376,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
   YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
 
+  assert(golden_ref != NULL);
+
   // we need to look ahead beyond where the ARF transitions into
   // being a GF - so exit if we don't look ahead beyond that
   if (n_frames <= cpi->rc.frames_till_gf_update_due)
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 159e0fc0c..c49a8bef3 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1714,6 +1714,152 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
   return bestsad;
 }
 
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
+  for (d = 0; d <= bw; d += 16) {
+    this_sad = vp9_vector_var(&ref[d], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 16) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 8) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -1; d <= 1; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > bw)
+      continue;
+    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - (bw >> 1));
+}
+
+static const MV search_pos[9] = {
+  {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 0}, {0, 1},
+  {1, -1}, {1, 0}, {1, 1},
+};
+
+unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
+                                           BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0].src_mi->mbmi.mv[0].as_mv;
+  int best_sad;
+  MV this_mv;
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+  for (idx = 0; idx < search_width; idx += 16) {
+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vp9_int_pro_col(ref_buf, bw);
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < bw; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw);
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+  tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+  best_sad = INT_MAX;
+  this_mv = *tmp_mv;
+  for (idx = 0; idx < 9; ++idx) {
+    int this_sad;
+    src_buf = x->plane[0].src.buf;
+    ref_buf = xd->plane[0].pre[0].buf +
+        (search_pos[idx].row + this_mv.row) * ref_stride +
+        (search_pos[idx].col + this_mv.col);
+
+    this_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                      ref_buf, ref_stride);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      tmp_mv->row = search_pos[idx].row + this_mv.row;
+      tmp_mv->col = search_pos[idx].col + this_mv.col;
+    }
+  }
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+  x->pred_mv[LAST_FRAME] = *tmp_mv;
+
+  return best_sad;
+}
+
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index bba48fd6e..dd8a46079 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -72,7 +72,7 @@ int vp9_refining_search_sad(const struct macroblock *x,
                             const struct vp9_variance_vtable *fn_ptr,
                             const struct mv *center_mv);
 
-// Runs sequence of diamond searches in smaller steps for RD
+// Runs sequence of diamond searches in smaller steps for RD.
 int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
@@ -80,6 +80,11 @@ int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
+// Perform integral projection based motion estimation.
+unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
+                                           MACROBLOCK *x,
+                                           BLOCK_SIZE bsize);
+
 typedef int (integer_mv_pattern_search_fn) (
     const MACROBLOCK *x,
     MV *ref_mv,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 1e88201be..2f9cccb99 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -689,12 +689,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 #endif
 
   for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+
     x->pred_mv_sad[ref_frame] = INT_MAX;
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
 
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
       int_mv *const candidates = mbmi->ref_mvs[ref_frame];
       const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
 
@@ -1076,11 +1077,11 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   ctx->pred_pixel_ready = 0;
 
   for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
     int_mv dummy_mv[2];
     x->pred_mv_sad[ref_frame] = INT_MAX;
 
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
       int_mv *const candidates = mbmi->ref_mvs[ref_frame];
       const struct scale_factors *const sf =
                              &cm->frame_refs[ref_frame - 1].sf;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 7143987d4..2523d1ea3 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -19,7 +19,8 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant_ptr, uint16_t *eob_ptr) {
@@ -29,6 +30,9 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int tmp, eob = -1;
 
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
   if (!skip_block) {
     tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 16;
@@ -41,12 +45,16 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
                             const int16_t *round_ptr, const int16_t quant,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t dequant_ptr, uint16_t *eob_ptr) {
   int eob = -1;
 
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
   if (!skip_block) {
     const int rc = 0;
     const int coeff = coeff_ptr[rc];
@@ -69,15 +77,20 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant_ptr, uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
   const int coeff_sign = (coeff >> 31);
   const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
   int tmp, eob = -1;
 
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
   if (!skip_block) {
 
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+                INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 15;
     qcoeff_ptr[rc]  = (tmp ^ coeff_sign) - coeff_sign;
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
@@ -96,8 +109,12 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
                                   const int16_t dequant_ptr,
                                   uint16_t *eob_ptr) {
+  const int n_coeffs = 1024;
   int eob = -1;
 
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
   if (!skip_block) {
     const int rc = 0;
     const int coeff = coeff_ptr[rc];
@@ -105,8 +122,8 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
     const int64_t tmp =
-        (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
-         quant) >> 15;
+        (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+               INT32_MIN, INT32_MAX) * quant) >> 15;
     qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
     if (tmp)
@@ -521,21 +538,21 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
-                        16, x->skip_block,
-                        p->zbin, p->round, p->quant, p->quant_shift,
-                        BLOCK_OFFSET(p->qcoeff, block),
-                        BLOCK_OFFSET(pd->dqcoeff, block),
-                        pd->dequant, &p->eobs[block],
-                        scan, iscan);
+                          16, x->skip_block,
+                          p->zbin, p->round, p->quant, p->quant_shift,
+                          BLOCK_OFFSET(p->qcoeff, block),
+                          BLOCK_OFFSET(pd->dqcoeff, block),
+                          pd->dequant, &p->eobs[block],
+                          scan, iscan);
     return;
   }
 #endif
   vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
-           16, x->skip_block,
-           p->zbin, p->round, p->quant, p->quant_shift,
-           BLOCK_OFFSET(p->qcoeff, block),
-           BLOCK_OFFSET(pd->dqcoeff, block),
-           pd->dequant, &p->eobs[block], scan, iscan);
+                 16, x->skip_block,
+                 p->zbin, p->round, p->quant, p->quant_shift,
+                 BLOCK_OFFSET(p->qcoeff, block),
+                 BLOCK_OFFSET(pd->dqcoeff, block),
+                 pd->dequant, &p->eobs[block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index de2839f5b..55e546944 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,7 +37,8 @@ typedef struct {
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+                     int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant_ptr,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant_ptr, uint16_t *eob_ptr);
@@ -49,7 +50,8 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
                             const int16_t *round_ptr, const int16_t quant_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t dequant_ptr, uint16_t *eob_ptr);
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 0b6d11eca..5cc980cf7 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -532,13 +532,14 @@ int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
   return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
 }
 
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
-                                                   int ref_frame) {
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+                                             int ref_frame) {
   const VP9_COMMON *const cm = &cpi->common;
-  const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
-  return (scaled_idx != ref_idx) ?
-      &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
+  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+  return
+      (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ?
+          &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
 }
 
 int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index 59a87cf98..b18a81659 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -147,8 +147,8 @@ int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
 int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
                                        int raster_block, int16_t *base);
 
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
-                                                   int ref_frame);
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+                                             int ref_frame);
 
 void vp9_init_me_luts();
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 53a5dba19..51397a791 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -328,12 +328,12 @@ static const int16_t band_counts[TX_SIZES][8] = {
   { 1, 2, 3, 4, 11,  256 - 21, 0 },
   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 };
-static INLINE int cost_coeffs(MACROBLOCK *x,
-                              int plane, int block,
-                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
-                              TX_SIZE tx_size,
-                              const int16_t *scan, const int16_t *nb,
-                              int use_fast_coef_costing) {
+static int cost_coeffs(MACROBLOCK *x,
+                       int plane, int block,
+                       ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+                       TX_SIZE tx_size,
+                       const int16_t *scan, const int16_t *nb,
+                       int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
   const struct macroblock_plane *p = &x->plane[plane];
@@ -1316,8 +1316,8 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
 
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
-      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
-                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memmove(&mic->bmi[i + idy * 2 + idx],
+                  &mic->bmi[i], sizeof(mic->bmi[i]));
 
   return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
             thismvcost;
@@ -2022,6 +2022,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
 
+  assert(yv12 != NULL);
+
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
   vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
@@ -2912,6 +2914,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
@@ -4238,4 +4241,3 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
   store_coding_context(x, ctx, best_ref_index,
                        best_pred_diff, best_tx_diff, best_filter_diff, 0);
 }
-
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 161b5a24d..bf9cad00a 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -91,8 +91,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
       if (oxcf->ss_enable_auto_arf[layer])
         lc->alt_ref_idx = alt_ref_idx++;
       else
-        lc->alt_ref_idx = -1;
-      lc->gold_ref_idx = -1;
+        lc->alt_ref_idx = INVALID_IDX;
+      lc->gold_ref_idx = INVALID_IDX;
     }
 
     lrc->buffer_level = oxcf->starting_buffer_level_ms *
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 482fa3da3..f49949940 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -90,8 +90,16 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
   s0 = _mm_adds_epu16(s0, t0);
   s1 = _mm_adds_epu16(s1, t1);
 
-  s0 = _mm_srai_epi16(s0, 5);
-  s1 = _mm_srai_epi16(s1, 5);
+  if (height == 64) {
+    s0 = _mm_srai_epi16(s0, 5);
+    s1 = _mm_srai_epi16(s1, 5);
+  } else if (height == 32) {
+    s0 = _mm_srai_epi16(s0, 4);
+    s1 = _mm_srai_epi16(s1, 4);
+  } else {
+    s0 = _mm_srai_epi16(s0, 3);
+    s1 = _mm_srai_epi16(s1, 3);
+  }
 
   _mm_store_si128((__m128i *)hbuf, s0);
   hbuf += 8;
@@ -104,6 +112,7 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
   __m128i s0 = _mm_sad_epu8(src_line, zero);
   __m128i s1;
   int i;
+  const int norm_factor = 3 + (width >> 5);
 
   for (i = 16; i < width; i += 16) {
     ref += 16;
@@ -115,7 +124,7 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
   s1 = _mm_srli_si128(s0, 8);
   s0 = _mm_adds_epu16(s0, s1);
 
-  return (_mm_extract_epi16(s0, 0)) >> 5;
+  return _mm_extract_epi16(s0, 0) >> norm_factor;
 }
 
 int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 9bb880c7c..9e4c1a5c4 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -278,7 +278,7 @@ static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
     VP9_COMMON *const cm = &frame_worker_data->pbi->common;
     BufferPool *const pool = cm->buffer_pool;
 
-    cm->new_fb_idx = -1;
+    cm->new_fb_idx = INVALID_IDX;
     cm->byte_alignment = ctx->byte_alignment;
 
     if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
@@ -500,7 +500,6 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
 
     check_resync(ctx, frame_worker_data->pbi);
   } else {
-    const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
     VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     // Copy context from last worker thread to next worker thread.