15 files changed, 218 insertions, 69 deletions
diff --git a/examples.mk b/examples.mk
index f1cc42bf7..8426ee769 100644
--- a/examples.mk
+++ b/examples.mk
@@ -8,6 +8,12 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
+LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
+                third_party/libyuv/include/libyuv/cpu_id.h  \
+                third_party/libyuv/include/libyuv/scale.h  \
+                third_party/libyuv/source/row.h \
+                third_party/libyuv/source/scale.c  \
+                third_party/libyuv/source/cpu_id.c
 
 # List of examples to build. UTILS are files that are taken from the source
 # tree directly, and GEN_EXAMPLES are files that are created from the
@@ -25,6 +31,7 @@ vpxdec.SRCS                 += nestegg/halloc/src/hlist.h
 vpxdec.SRCS                 += nestegg/halloc/src/macros.h
 vpxdec.SRCS                 += nestegg/include/nestegg/nestegg.h
 vpxdec.SRCS                 += nestegg/src/nestegg.c
+vpxdec.SRCS                 += $(LIBYUV_SRCS)
 vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
@@ -36,6 +43,7 @@ vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += libmkv/EbmlIDs.h
 vpxenc.SRCS                 += libmkv/EbmlWriter.c
 vpxenc.SRCS                 += libmkv/EbmlWriter.h
+vpxenc.SRCS                 += $(LIBYUV_SRCS)
 vpxenc.GUID                  = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
 vpxenc.DESCRIPTION           = Full featured encoder
 UTILS-$(CONFIG_VP8_ENCODER)    += vp8_scalable_patterns.c
@@ -99,13 +107,7 @@ vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
 # C file is provided, not generated automatically.
 UTILS-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
-vp8_multi_resolution_encoder.SRCS  \
-                         += third_party/libyuv/include/libyuv/basic_types.h  \
-                            third_party/libyuv/include/libyuv/cpu_id.h  \
-                            third_party/libyuv/include/libyuv/scale.h  \
-                            third_party/libyuv/source/row.h \
-                            third_party/libyuv/source/scale.c  \
-                            third_party/libyuv/source/cpu_id.c
+vp8_multi_resolution_encoder.SRCS         += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID         = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION  = VP8 Multiple-resolution Encoding
 
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 2d128e155..3e0ee4b63 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -13,6 +13,13 @@
 
 #include "./vpx_config.h"
 
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+
+/* If we don't want to use ROUND_POWER_OF_TWO macro
+static INLINE int16_t round_power_of_two(int16_t value, int n) {
+  return (value + (1 << (n - 1))) >> n;
+}*/
+
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
 #define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 632dae8fd..f34823b36 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -31,13 +31,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
-
-/* If we don't want to use ROUND_POWER_OF_TWO macro
-static INLINE int16_t round_power_of_two(int16_t value, int n) {
-  return (value + (1 << (n - 1))) >> n;
-}*/
-
 typedef void (*transform_1d)(int16_t*, int16_t*);
 
 typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 700af7fa7..02a6711e5 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -296,7 +296,7 @@ specialize vp9_short_iht16x16
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_idct_add
+specialize vp9_dc_only_idct_add sse2
 
 prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_inv_walsh4x4_1_x8
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c
new file mode 100644
index 000000000..667f5c1d3
--- /dev/null
+++ b/vp9/common/x86/vp9_idctllm_x86.c
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+// DC-only inverse transform and add for a single 4x4 block (SSE2).
+//
+//   input_dc: DC coefficient handed in by the caller.
+//   pred_ptr: prediction pixels; consecutive rows are pitch bytes apart.
+//   dst_ptr:  output pixels; consecutive rows are stride bytes apart.
+//
+// The rounded DC term is clamped to a magnitude of at most 255, so that it
+// can be applied to all 16 pixels at once with a single saturating 8-bit
+// add or subtract.
+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
+                               uint8_t *dst_ptr, int pitch, int stride) {
+  __m128i row0, row1, row2, row3;
+  __m128i pred, dc_bytes, result;
+  int16_t dc;
+  int dc_rounded;
+  int magnitude;
+  int row;
+
+  // Load the four 4-byte prediction rows.
+  row0 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 0 * pitch));
+  row1 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 1 * pitch));
+  row2 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 2 * pitch));
+  row3 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 3 * pitch));
+
+  // Pack rows 0..3 into one register, row 0 in the lowest 32 bits.
+  pred = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                            _mm_unpacklo_epi32(row2, row3));
+
+  // Multiply by cospi_16_64 and round-shift twice, then apply the final
+  // rounding shift by 4 bits.
+  dc = dct_const_round_shift(input_dc * cospi_16_64);
+  dc = dct_const_round_shift(dc * cospi_16_64);
+  dc_rounded = ROUND_POWER_OF_TWO(dc, 4);
+
+  // Saturate |dc_rounded| to 255.
+  magnitude = (dc_rounded < 0) ? -dc_rounded : dc_rounded;
+  if (magnitude > 255)
+    magnitude = 255;
+
+  // Replicate the magnitude into every byte lane.
+  dc_bytes = _mm_set1_epi32((int)(magnitude * 0x01010101u));
+
+  // The sign of the DC term selects saturating subtract or add.
+  if (dc_rounded < 0) {
+    result = _mm_subs_epu8(pred, dc_bytes);
+  } else {
+    result = _mm_adds_epu8(pred, dc_bytes);
+  }
+
+  // Store one 4-byte row per iteration, then shift the next row down into
+  // the lowest 32 bits.
+  for (row = 0; row < 4; ++row) {
+    *(int *)dst_ptr = _mm_cvtsi128_si32(result);
+    dst_ptr += stride;
+    result = _mm_srli_si128(result, 4);
+  }
+}
+#endif
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 170202ba4..8e9e5ad7d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -126,7 +126,7 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {
   xd->inv_txm4x4_1      = vp9_short_idct4x4llm_1;
   xd->inv_txm4x4        = vp9_short_idct4x4llm;
   xd->itxm_add          = vp9_dequant_idct_add;
-  xd->dc_only_itxm_add  = vp9_dc_only_idct_add_c;
+  xd->dc_only_itxm_add  = vp9_dc_only_idct_add;
   xd->itxm_add_y_block  = vp9_dequant_idct_add_y_block;
   xd->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
   if (xd->lossless) {
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index c330bf97d..0ec5036e4 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -47,7 +47,7 @@ void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
       if (xd->eobs[i * 4 + j] > 1)
         vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);
       else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);
+        vp9_dc_only_idct_add(q[0]*dq[0], pre, dst, 16, stride);
         ((int *)q)[0] = 0;
       }
 
@@ -72,7 +72,7 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
       if (xd->eobs[16 + i * 2 + j] > 1)
         vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);
       else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);
+        vp9_dc_only_idct_add(q[0]*dq[0], pre, dstu, 8, stride);
         ((int *)q)[0] = 0;
       }
 
@@ -90,7 +90,7 @@ void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
       if (xd->eobs[20 + i * 2 + j] > 1)
         vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);
       else {
-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);
+        vp9_dc_only_idct_add(q[0]*dq[0], pre, dstv, 8, stride);
         ((int *)q)[0] = 0;
       }
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index eaed1a964..c0fe5ac76 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1187,7 +1187,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 
   // Copy data over into macro block data structures.
   x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]];
+  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
   xd->dst = cm->yv12_fb[cm->new_fb_idx];
 
   // set up frame for intra coded blocks
@@ -2089,11 +2089,11 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
     setup_pred_block(&xd->pre,
                      &cpi->common.yv12_fb[ref_fb_idx],
@@ -2104,11 +2104,11 @@ static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
       int second_ref_fb_idx;
 
       if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
       setup_pred_block(&xd->second_pre,
                        &cpi->common.yv12_fb[second_ref_fb_idx],
@@ -2319,11 +2319,11 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
     setup_pred_block(&xd->pre,
                      &cpi->common.yv12_fb[ref_fb_idx],
@@ -2334,11 +2334,11 @@ static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
       int second_ref_fb_idx;
 
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
       setup_pred_block(&xd->second_pre,
                        &cpi->common.yv12_fb[second_ref_fb_idx],
@@ -2548,11 +2548,11 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
     assert(cm->frame_type != KEY_FRAME);
 
     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
     else
-      ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
     setup_pred_block(&xd->pre,
                      &cpi->common.yv12_fb[ref_fb_idx],
@@ -2563,11 +2563,11 @@ static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
       int second_ref_fb_idx;
 
       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->lst_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->gld_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
       else
-        second_ref_fb_idx = cpi->common.active_ref_idx[cpi->alt_fb_idx];
+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
 
       setup_pred_block(&xd->second_pre,
                        &cpi->common.yv12_fb[second_ref_fb_idx],
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 05a0f6f04..4d0a299e8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -436,10 +436,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
   int recon_yoffset, recon_uvoffset;
   YV12_BUFFER_CONFIG *lst_yv12 =
-      &cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]];
+      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
   YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
   YV12_BUFFER_CONFIG *gld_yv12 =
-      &cm->yv12_fb[cm->active_ref_idx[cpi->gld_fb_idx]];
+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
   int recon_y_stride = lst_yv12->y_stride;
   int recon_uv_stride = lst_yv12->uv_stride;
   int64_t intra_error = 0;
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index bc06c9458..d6644c2aa 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -445,7 +445,7 @@ void vp9_update_mbgraph_stats
   VP9_COMMON *const cm = &cpi->common;
   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
   YV12_BUFFER_CONFIG *golden_ref =
-      &cm->yv12_fb[cm->active_ref_idx[cpi->gld_fb_idx]];
+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
 
   // we need to look ahead beyond where the ARF transitions into
   // being a GF - so exit if we don't look ahead beyond that
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 45ab6cd8c..ced6eddca 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -833,7 +833,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   }
 
   {
-    int y_stride = cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].y_stride;
+    int y_stride = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].y_stride;
 
     if (cpi->sf.search_method == NSTEP) {
       vp9_init3smotion_compensation(&cpi->mb, y_stride);
@@ -1754,7 +1754,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 #endif
       if (cpi->b_calculate_psnr) {
         YV12_BUFFER_CONFIG *lst_yv12 =
-            &cpi->common.yv12_fb[cpi->common.active_ref_idx[cpi->lst_fb_idx]];
+            &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];
         double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
         double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
         double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
@@ -2099,11 +2099,11 @@ int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int ref_fb_idx;
 
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->lst_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->gld_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->alt_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
   else
     return -1;
 
@@ -2120,11 +2120,11 @@ int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
   int ref_fb_idx;
 
   if (ref_frame_flag == VP9_LAST_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->lst_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];
   else if (ref_frame_flag == VP9_GOLD_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->gld_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];
   else if (ref_frame_flag == VP9_ALT_FLAG)
-    ref_fb_idx = cm->active_ref_idx[cpi->alt_fb_idx];
+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];
   else
     return -1;
 
@@ -2480,9 +2480,9 @@ static void update_reference_frames(VP9_COMP * const cpi) {
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->gld_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->alt_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
   } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
     /* Preserve the previously existing golden frame and update the frame in
      * the alt ref slot instead. This is highly specific to the current use of
@@ -2496,7 +2496,7 @@ static void update_reference_frames(VP9_COMP * const cpi) {
     int tmp;
 
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->alt_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
 
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
@@ -2504,18 +2504,18 @@ static void update_reference_frames(VP9_COMP * const cpi) {
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
       ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->active_ref_idx[cpi->alt_fb_idx], cm->new_fb_idx);
+                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
     }
 
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->active_ref_idx[cpi->gld_fb_idx], cm->new_fb_idx);
+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     }
   }
 
   if (cpi->refresh_last_frame) {
     ref_cnt_fb(cm->fb_idx_ref_cnt,
-               &cm->active_ref_idx[cpi->lst_fb_idx], cm->new_fb_idx);
+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
   }
 }
 
@@ -2604,7 +2604,7 @@ static void scale_references(VP9_COMP *cpi) {
   int i;
 
   for (i = 0; i < 3; i++) {
-    YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->active_ref_idx[i]];
+    YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]];
 
     if (ref->y_width != cm->Width || ref->y_height != cm->Height) {
       int new_fb = get_free_fb(cm);
@@ -2616,8 +2616,8 @@ static void scale_references(VP9_COMP *cpi) {
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[i] = new_fb;
     } else {
-      cpi->scaled_ref_idx[i] = cm->active_ref_idx[i];
-      cm->fb_idx_ref_cnt[cm->active_ref_idx[i]]++;
+      cpi->scaled_ref_idx[i] = cm->ref_frame_map[i];
+      cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++;
     }
   }
 }
@@ -3644,8 +3644,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
     FILE *recon_file;
     sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
     recon_file = fopen(filename, "wb");
-    fwrite(cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].buffer_alloc,
-           cm->yv12_fb[cm->active_ref_idx[cpi->lst_fb_idx]].frame_size,
+    fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
+           cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
            1, recon_file);
     fclose(recon_file);
   }
@@ -3867,6 +3867,11 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
   cm->new_fb_idx = get_free_fb(cm);
 
+  /* Get the mapping of L/G/A to the reference buffer pool */
+  cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];
+  cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];
+  cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];
+
   /* Reset the frame pointers to the current frame size */
   vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
                                 cm->mb_cols * 16, cm->mb_rows * 16,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index a2e6c34b5..496be950c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -3111,7 +3111,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                YV12_BUFFER_CONFIG yv12_mb[4],
                                struct scale_factors scale[MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
-  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.active_ref_idx[idx]];
+  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int use_prev_in_find_mv_refs, use_prev_in_find_best_ref;
@@ -4083,7 +4083,7 @@ static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         fb = cpi->alt_fb_idx;
       }
 
-      if (cpi->scaled_ref_idx[fb] != cm->active_ref_idx[fb])
+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
         scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
     }
 
@@ -5176,7 +5176,7 @@ static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         fb = cpi->alt_fb_idx;
       }
 
-      if (cpi->scaled_ref_idx[fb] != cm->active_ref_idx[fb])
+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
         scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
 
 #if CONFIG_COMP_INTERINTRA_PRED
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index eb152f521..f330b464a 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -110,10 +110,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c
 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm
 endif
 
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c
 ifeq ($(HAVE_SSE2),yes)
+vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2
+vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2
 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2
 endif
diff --git a/vpxdec.c b/vpxdec.c
index f7281a4ef..30196ecc8 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -30,6 +30,7 @@
 #endif
 #include "tools_common.h"
 #include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/libyuv/include/libyuv/scale.h"
 
 #if CONFIG_OS_SUPPORT
 #if defined(_MSC_VER)
@@ -93,6 +94,8 @@ static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0,
                                             "Show version string");
 static const arg_def_t error_concealment = ARG_DEF(NULL, "error-concealment", 0,
                                                    "Enable decoder error-concealment");
+static const arg_def_t scalearg = ARG_DEF("S", "scale", 0,
+                                            "Scale output frames uniformly");
 
 
 #if CONFIG_MD5
@@ -102,7 +105,7 @@ static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
 static const arg_def_t *all_args[] = {
   &codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg,
   &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
-  &threadsarg, &verbosearg,
+  &threadsarg, &verbosearg, &scalearg,
 #if CONFIG_MD5
   &md5arg,
 #endif
@@ -708,6 +711,9 @@ int main(int argc, const char **argv_) {
   struct input_ctx        input = {0};
   int                     frames_corrupted = 0;
   int                     dec_flags = 0;
+  int                     do_scale = 0;  /* set by --scale (-S) */
+  int                     stream_w = 0, stream_h = 0;
+  vpx_image_t             *scaled_img = NULL;
 
   /* Parse command line */
   exec_name = argv_[0];
@@ -757,6 +763,8 @@ int main(int argc, const char **argv_) {
       cfg.threads = arg_parse_uint(&arg);
     else if (arg_match(&arg, &verbosearg, argi))
       quiet = 0;
+    else if (arg_match(&arg, &scalearg, argi))
+      do_scale = 1;
 
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
@@ -1015,6 +1023,30 @@ int main(int argc, const char **argv_) {
       show_progress(frame_in, frame_out, dx_time);
 
     if (!noblit) {
+      if (do_scale) {
+        /* img may be NULL; the output size is latched when frame_out == 1. */
+        if (img && frame_out == 1) {
+          stream_w = img->d_w;
+          stream_h = img->d_h;
+          scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
+                                     stream_w, stream_h, 16);
+        }
+        if (img && (img->d_w != stream_w || img->d_h != stream_h)) {
+          I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
+                    img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
+                    img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
+                    img->d_w, img->d_h,
+                    scaled_img->planes[VPX_PLANE_Y],
+                    scaled_img->stride[VPX_PLANE_Y],
+                    scaled_img->planes[VPX_PLANE_U],
+                    scaled_img->stride[VPX_PLANE_U],
+                    scaled_img->planes[VPX_PLANE_V],
+                    scaled_img->stride[VPX_PLANE_V],
+                    stream_w, stream_h, kFilterBox);
+          img = scaled_img;
+        }
+      }
+
       if (img) {
         unsigned int y;
         char out_fn[PATH_MAX];
diff --git a/vpxenc.c b/vpxenc.c
index 2f3b6356d..7597e3cbb 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -47,6 +47,7 @@
 #include "y4minput.h"
 #include "libmkv/EbmlWriter.h"
 #include "libmkv/EbmlIDs.h"
+#include "third_party/libyuv/include/libyuv/scale.h"
 
 /* Need special handling of these functions on Windows */
 #if defined(_MSC_VER)
@@ -1642,6 +1643,7 @@ struct stream_state {
   uint64_t                  cx_time;
   size_t                    nbytes;
   stats_io_t                stats;
+  struct vpx_image         *img;
   vpx_codec_ctx_t           decoder;
   vpx_ref_frame_t           ref_enc;
   vpx_ref_frame_t           ref_dec;
@@ -2061,11 +2063,15 @@ static void validate_stream_config(struct stream_state *stream) {
 static void set_stream_dimensions(struct stream_state *stream,
                                   unsigned int w,
                                   unsigned int h) {
-  if ((stream->config.cfg.g_w && stream->config.cfg.g_w != w)
-      || (stream->config.cfg.g_h && stream->config.cfg.g_h != h))
-    fatal("Stream %d: Resizing not yet supported", stream->index);
-  stream->config.cfg.g_w = w;
-  stream->config.cfg.g_h = h;
+  /* Only unset (zero) dimensions are filled in; explicit settings are kept.
+   * A single unset dimension is derived from the input's aspect ratio. */
+  if (!stream->config.cfg.g_w) {
+    stream->config.cfg.g_w = stream->config.cfg.g_h
+                             ? w * stream->config.cfg.g_h / h
+                             : w;
+  }
+  if (!stream->config.cfg.g_h)
+    stream->config.cfg.g_h = h * stream->config.cfg.g_w / w;
 }
 
 
@@ -2258,6 +2264,28 @@ static void encode_frame(struct stream_state  *stream,
   next_frame_start = (cfg->g_timebase.den * (int64_t)(frames_in)
                       * global->framerate.den)
                      / cfg->g_timebase.num / global->framerate.num;
+
+  /* Scale if necessary, into a per-stream buffer allocated on first use. */
+  if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+    struct vpx_image *scaled;
+
+    if (!stream->img)
+      stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
+                                  cfg->g_w, cfg->g_h, 16);
+    if (!stream->img)
+      fatal("Stream %d: Failed to allocate scaling buffer", stream->index);
+    scaled = stream->img;
+    I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
+              img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
+              img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
+              img->d_w, img->d_h,
+              scaled->planes[VPX_PLANE_Y], scaled->stride[VPX_PLANE_Y],
+              scaled->planes[VPX_PLANE_U], scaled->stride[VPX_PLANE_U],
+              scaled->planes[VPX_PLANE_V], scaled->stride[VPX_PLANE_V],
+              scaled->d_w, scaled->d_h, kFilterBox);
+    img = scaled;
+  }
+
   vpx_usec_timer_start(&timer);
   vpx_codec_encode(&stream->encoder, img, frame_start,
                    (unsigned long)(next_frame_start - frame_start),
@@ -2518,6 +2546,9 @@ int main(int argc, const char **argv_) {
     });
 
     /* Update stream configurations from the input file's parameters */
+    if (!input.w || !input.h)
+      fatal("Specify stream dimensions with --width (-w) "
+            " and --height (-h)");
     FOREACH_STREAM(set_stream_dimensions(stream, input.w, input.h));
     FOREACH_STREAM(validate_stream_config(stream));