59 files changed, 1646 insertions, 2118 deletions
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index 2967b5aed..ed037132a 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -209,6 +209,10 @@ common_top() {
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 $(process_forward_decls)
 
 $(declare_function_pointers c $ALL_ARCHS)
@@ -219,6 +223,11 @@ EOF
 
 common_bottom() {
   cat <<EOF
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif
 EOF
 }
diff --git a/example_xma.c b/example_xma.c
deleted file mode 100644
index 7aa879810..000000000
--- a/example_xma.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This is a simple program showing how to initialize the decoder in XMA mode */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx_config.h"
-#include "vpx/vpx_decoder.h"
-#include "vpx/vpx_integer.h"
-#if CONFIG_VP9_DECODER
-#include "vpx/vp8dx.h"
-#endif
-
-static char *exec_name;
-static int   verbose = 0;
-
-static const struct {
-  const char *name;
-  const vpx_codec_iface_t *iface;
-} ifaces[] = {
-#if CONFIG_VP9_DECODER
-  {"vp9",  &vpx_codec_vp8_dx_algo},
-#endif
-};
-
-static void usage_exit(void) {
-  int i;
-
-  printf("Usage: %s <options>\n\n"
-         "Options:\n"
-         "\t--codec <name>\tCodec to use (default=%s)\n"
-         "\t-h <height>\tHeight of the simulated video frame, in pixels\n"
-         "\t-w <width> \tWidth of the simulated video frame, in pixels\n"
-         "\t-v         \tVerbose mode (show individual segment sizes)\n"
-         "\t--help     \tShow this message\n"
-         "\n"
-         "Included decoders:\n"
-         "\n",
-         exec_name,
-         ifaces[0].name);
-
-  for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
-    printf("    %-6s - %s\n",
-           ifaces[i].name,
-           vpx_codec_iface_name(ifaces[i].iface));
-
-  exit(EXIT_FAILURE);
-}
-
-static void usage_error(const char *fmt, ...) {
-  va_list ap;
-  va_start(ap, fmt);
-  vprintf(fmt, ap);
-  printf("\n");
-  usage_exit();
-}
-
-void my_mem_dtor(vpx_codec_mmap_t *mmap) {
-  if (verbose)
-    printf("freeing segment %d\n", mmap->id);
-
-  free(mmap->priv);
-}
-
-int main(int argc, char **argv) {
-  vpx_codec_ctx_t           decoder;
-  vpx_codec_iface_t        *iface = ifaces[0].iface;
-  vpx_codec_iter_t          iter;
-  vpx_codec_dec_cfg_t       cfg;
-  vpx_codec_err_t           res = VPX_CODEC_OK;
-  unsigned int            alloc_sz = 0;
-  unsigned int            w = 352;
-  unsigned int            h = 288;
-  int                     i;
-
-  exec_name = argv[0];
-
-  for (i = 1; i < argc; i++) {
-    if (!strcmp(argv[i], "--codec")) {
-      if (i + 1 < argc) {
-        int j, k = -1;
-
-        i++;
-
-        for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++)
-          if (!strcmp(ifaces[j].name, argv[i]))
-            k = j;
-
-        if (k >= 0)
-          iface = ifaces[k].iface;
-        else
-          usage_error("Error: Unrecognized argument (%s) to --codec\n",
-                      argv[i]);
-      } else
-        usage_error("Error: Option --codec requires argument.\n");
-    } else if (!strcmp(argv[i], "-v"))
-      verbose = 1;
-    else if (!strcmp(argv[i], "-h"))
-      if (i + 1 < argc) {
-        h = atoi(argv[++i]);
-      } else
-        usage_error("Error: Option -h requires argument.\n");
-    else if (!strcmp(argv[i], "-w"))
-      if (i + 1 < argc) {
-        w = atoi(argv[++i]);
-      } else
-        usage_error("Error: Option -w requires argument.\n");
-    else if (!strcmp(argv[i], "--help"))
-      usage_exit();
-    else
-      usage_error("Error: Unrecognized option %s\n\n", argv[i]);
-  }
-
-  if (argc == 1)
-    printf("Using built-in defaults. For options, rerun with --help\n\n");
-
-  /* XMA mode is not supported on all decoders! */
-  if (!(vpx_codec_get_caps(iface) & VPX_CODEC_CAP_XMA)) {
-    printf("%s does not support XMA mode!\n", vpx_codec_iface_name(iface));
-    return EXIT_FAILURE;
-  }
-
-  /* The codec knows how much memory to allocate based on the size of the
-   * encoded frames. This data can be parsed from the bitstream with
-   * vpx_codec_peek_stream_info() if a bitstream is available. Otherwise,
-   * a fixed size can be used that will be the upper limit on the frame
-   * size the decoder can decode.
-   */
-  cfg.w = w;
-  cfg.h = h;
-
-  /* Initialize the decoder in XMA mode. */
-  if (vpx_codec_dec_init(&decoder, iface, &cfg, VPX_CODEC_USE_XMA)) {
-    printf("Failed to initialize decoder in XMA mode: %s\n", vpx_codec_error(&decoder));
-    return EXIT_FAILURE;
-  }
-
-  /* Iterate through the list of memory maps, allocating them with the
-   * requested alignment.
-   */
-  iter = NULL;
-
-  do {
-    vpx_codec_mmap_t  mmap;
-    unsigned int    align;
-
-    res = vpx_codec_get_mem_map(&decoder, &mmap, &iter);
-    align = mmap.align ? mmap.align - 1 : 0;
-
-    if (!res) {
-      if (verbose)
-        printf("Allocating segment %u, size %lu, align %u %s\n",
-               mmap.id, mmap.sz, mmap.align,
-               mmap.flags & VPX_CODEC_MEM_ZERO ? "(ZEROED)" : "");
-
-      if (mmap.flags & VPX_CODEC_MEM_ZERO)
-        mmap.priv = calloc(1, mmap.sz + align);
-      else
-        mmap.priv = malloc(mmap.sz + align);
-
-      mmap.base = (void *)((((uintptr_t)mmap.priv) + align) & ~(uintptr_t)align);
-      mmap.dtor = my_mem_dtor;
-      alloc_sz += mmap.sz + align;
-
-      if (vpx_codec_set_mem_map(&decoder, &mmap, 1)) {
-        printf("Failed to set mmap: %s\n", vpx_codec_error(&decoder));
-        return EXIT_FAILURE;
-      }
-    } else if (res != VPX_CODEC_LIST_END) {
-      printf("Failed to get mmap: %s\n", vpx_codec_error(&decoder));
-      return EXIT_FAILURE;
-    }
-  } while (res != VPX_CODEC_LIST_END);
-
-  printf("%s\n    %d bytes external memory required for %dx%d.\n",
-         decoder.name, alloc_sz, cfg.w, cfg.h);
-  vpx_codec_destroy(&decoder);
-  return EXIT_SUCCESS;
-
-}
diff --git a/examples.mk b/examples.mk
index 66b719ca0..b29ab9c34 100644
--- a/examples.mk
+++ b/examples.mk
@@ -26,6 +26,7 @@ vpxdec.SRCS                 += args.c args.h
 vpxdec.SRCS                 += ivfdec.c ivfdec.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += webmdec.c webmdec.h
+vpxdec.SRCS                 += y4menc.c y4menc.h
 vpxdec.SRCS                 += nestegg/halloc/halloc.h
 vpxdec.SRCS                 += nestegg/halloc/src/align.h
 vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
@@ -109,11 +110,13 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
-# C file is provided, not generated automatically.
-UTILS-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c
-vp8_multi_resolution_encoder.SRCS         += $(LIBYUV_SRCS)
-vp8_multi_resolution_encoder.GUID         = 04f8738e-63c8-423b-90fa-7c2703a374de
-vp8_multi_resolution_encoder.DESCRIPTION  = VP8 Multiple-resolution Encoding
+# VP8 multi-resolution encoder example: only built with the VP8 encoder.
+ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
+GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
+vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
+vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
+endif
 
 # Handle extra library flags depending on codec configuration
 
diff --git a/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 4c29056e5..4c29056e5 100644
--- a/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 2ca6ff086..c060e86dc 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -10,7 +10,6 @@
 #ifndef TEST_CODEC_FACTORY_H_
 #define TEST_CODEC_FACTORY_H_
 
-extern "C" {
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vpx_encoder.h"
@@ -20,7 +19,6 @@ extern "C" {
 #if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
-}
 
 #include "test/decode_test_driver.h"
 #include "test/encode_test_driver.h"
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 0b4ddaece..db7dfdb53 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -7,11 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"
 
 namespace {
 
@@ -286,6 +288,37 @@ TEST_P(DatarateTestVP9, BasicRateTargeting) {
   }
 }
 
+#if CONFIG_NON420
+// Check basic rate targeting for 4:4:4 (profile 1) input.
+TEST_P(DatarateTestVP9, BasicRateTargeting444) {
+  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_dropframe_thresh = 1;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_end_usage = VPX_CBR;
+
+  for (int i = 250; i < 900; i += 200) {
+    cfg_.rc_target_bitrate = i;
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " "<< effective_datarate_;
+  }
+}
+#endif
+
 // Check that (1) the first dropped frame gets earlier and earlier
 // as the drop frame threshold is increased, and (2) that the total number of
 // frame drops does not decrease as we increase frame drop threshold.
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index d8c61ffb2..5f4c33a81 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -14,9 +14,7 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-extern "C" {
 #include "./vp9_rtcd.h"
-}
 
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
diff --git a/test/idct_test.cc b/test/idct_test.cc
index 2c7fa0ef8..1bbf80a0a 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -8,10 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-extern "C" {
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
-}
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index e5ac9db2b..ff7bb08e3 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -10,12 +10,10 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-extern "C" {
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
-}
 
 typedef void (*post_proc_func_t)(unsigned char *src_ptr,
                                  unsigned char *dst_ptr,
diff --git a/test/resize_test.cc b/test/resize_test.cc
index e8c2c825b..1963453fd 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -208,7 +208,7 @@ class ResizeInternalTest : public ResizeTest {
   virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
     if (!frame0_psnr_)
       frame0_psnr_ = pkt->data.psnr.psnr[0];
-    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.5);
+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
   }
 
   virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 453b3a84e..4a91b0b60 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -13,7 +13,6 @@
 #include <limits.h>
 #include <stdio.h>
 
-extern "C" {
 #include "./vpx_config.h"
 #if CONFIG_VP8_ENCODER
 #include "./vp8_rtcd.h"
@@ -22,7 +21,6 @@ extern "C" {
 #include "./vp9_rtcd.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
-}
 
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 0f5c0a5e8..3434662fb 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -16,12 +16,10 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-extern "C" {
 #include "./vpx_config.h"
 #include "./vp8_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
-}
 
 namespace {
 
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 492705948..6daf69e63 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,5 +1,6 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
+b1f1c3ec79114b9a0651af24ce634afb44a9a419  rush_hour_444.y4m
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
 906b4c1e99eb734504c504b3f1ad8052137ce672  vp80-00-comprehensive-003.ivf
diff --git a/test/test.mk b/test/test.mk
index 5a1d39de5..178b16210 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -24,6 +24,8 @@ LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
@@ -118,6 +120,7 @@ endif
 ##
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 6d93bb88f..4adf9af91 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -19,9 +19,7 @@
 #include "test/test_vectors.h"
 #include "test/util.h"
 #include "test/webm_video_source.h"
-extern "C" {
 #include "vpx_mem/vpx_mem.h"
-}
 
 namespace {
 
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 863a3669a..d7144522b 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -17,9 +17,7 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/md5_helper.h"
-extern "C" {
 #include "vpx_mem/vpx_mem.h"
-}
 
 namespace {
 class TileIndependenceTest : public ::libvpx_test::EncoderTest,
diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index 25465c53c..e3c292ea1 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -15,9 +15,7 @@
 #include <string.h>
 #include <sys/types.h>
 
-extern "C" {
 #include "./vp8_rtcd.h"
-}
 
 #include "test/acm_random.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc
index 03b89f8df..2282687dc 100644
--- a/test/vp9_lossless_test.cc
+++ b/test/vp9_lossless_test.cc
@@ -7,12 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
+#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
 #include "test/i420_video_source.h"
 #include "test/util.h"
+#include "test/y4m_video_source.h"
 
 namespace {
 
@@ -71,5 +72,25 @@ TEST_P(LossLessTest, TestLossLessEncoding) {
   const double psnr_lossless = GetMinPsnr();
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
+
+#if CONFIG_NON420
+TEST_P(LossLessTest, TestLossLessEncoding444) {
+  libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+
+  init_flags_ = VPX_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+#endif
+
 VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES);
 }  // namespace
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
new file mode 100644
index 000000000..bd86c2c04
--- /dev/null
+++ b/test/y4m_video_source.h
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_Y4M_VIDEO_SOURCE_H_
+#define TEST_Y4M_VIDEO_SOURCE_H_
+#include <string>
+
+#include "test/video_source.h"
+extern "C" {
+#include "./y4minput.h"
+}
+
+namespace libvpx_test {
+
+// This class extends VideoSource to read frames from a Y4M file
+// so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+  Y4mVideoSource(const std::string &file_name,
+                  unsigned int start, int limit)
+      : file_name_(file_name),
+        input_file_(NULL),
+        img_(new vpx_image_t()),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        framerate_numerator_(0),
+        framerate_denominator_(0),
+        y4m_() {
+  }
+
+  virtual ~Y4mVideoSource() {
+    vpx_img_free(img_.get());
+    y4m_input_close(&y4m_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_)
+      fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
+        << file_name_;
+
+    y4m_input_open(&y4m_, input_file_, NULL, 0, 0);
+    framerate_numerator_ = y4m_.fps_n;
+    framerate_denominator_ = y4m_.fps_d;
+
+    frame_ = 0;
+    for (unsigned int i = 0; i < start_; i++) {
+        Next();
+    }
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_.get() : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  testing::internal::scoped_ptr<vpx_image_t> img_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  y4m_input y4m_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 209a25d82..119e40cdc 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -39,8 +39,8 @@ extern "C"
 
     typedef enum
     {
-        USAGE_STREAM_FROM_SERVER    = 0x0,
-        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+        USAGE_LOCAL_FILE_PLAYBACK   = 0x0,
+        USAGE_STREAM_FROM_SERVER    = 0x1,
         USAGE_CONSTRAINED_QUALITY   = 0x2,
         USAGE_CONSTANT_QUALITY      = 0x3
     } END_USAGE;
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index d517dfa37..3f8539759 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -119,8 +119,10 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg,
                                                      v_abs_adjustment);
         v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
         v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
-        v_sum_diff = vqaddq_s8(v_sum_diff, (int8x16_t)v_pos_adjustment);
-        v_sum_diff = vqsubq_s8(v_sum_diff, (int8x16_t)v_neg_adjustment);
+        v_sum_diff = vqaddq_s8(v_sum_diff,
+                               vreinterpretq_s8_u8(v_pos_adjustment));
+        v_sum_diff = vqsubq_s8(v_sum_diff,
+                               vreinterpretq_s8_u8(v_neg_adjustment));
 
         /* Store results. */
         vst1q_u8(running_avg_y, v_running_avg_y);
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm
index 98619bb30..71bf24c9f 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -17,6 +17,7 @@
     EXPORT  |vp9_h_predictor_16x16_neon|
     EXPORT  |vp9_h_predictor_32x32_neon|
     EXPORT  |vp9_tm_predictor_4x4_neon|
+    EXPORT  |vp9_tm_predictor_8x8_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -328,8 +329,78 @@ loop_h
     vqshrun.s16         d1, q2, #0
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
-
     bx                  lr
     ENDP                ; |vp9_tm_predictor_4x4_neon|
 
+;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
+;                                const uint8_t *above,
+;                                const uint8_t *left)
+; r0  uint8_t *dst
+; r1  ptrdiff_t y_stride
+; r2  const uint8_t *above
+; r3  const uint8_t *left
+
+|vp9_tm_predictor_8x8_neon| PROC
+    ; Load ytop_left = above[-1];
+    sub                 r12, r2, #1
+    ldrb                r12, [r12]
+    vdup.u8             d0, r12
+
+    ; Load above 8 pixels
+    vld1.64             {d2}, [r2]
+
+    ; Compute above - ytop_left
+    vsubl.u8            q3, d2, d0
+
+    ; Load left row by row and compute left + (above - ytop_left)
+    ; 1st row and 2nd row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+
+    ; 3rd row and 4th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+
+    ; 5th row and 6th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+
+    ; 7th row and 8th row
+    ldrb                r12, [r3], #1
+    ldrb                r2, [r3], #1
+    vdup.u16            q1, r12
+    vdup.u16            q2, r2
+    vadd.s16            q1, q1, q3
+    vadd.s16            q2, q2, q3
+    vqshrun.s16         d0, q1, #0
+    vqshrun.s16         d1, q2, #0
+    vst1.64             {d0}, [r0], r1
+    vst1.64             {d1}, [r0], r1
+    bx                  lr
+    ENDP                ; |vp9_tm_predictor_8x8_neon|
+
     END
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index ff20553d6..ca42090c1 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -85,7 +85,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
   int mi_size;
 
   if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                               VP9BORDERINPIXELS, NULL, NULL, NULL) < 0)
+                               VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
     goto fail;
 
   set_mb_mi(cm, aligned_width, aligned_height);
@@ -154,7 +154,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   for (i = 0; i < cm->fb_count; i++) {
     cm->fb_idx_ref_cnt[i] = 0;
     if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
-                               VP9BORDERINPIXELS) < 0)
+                               VP9_ENC_BORDER_IN_PIXELS) < 0)
       goto fail;
   }
 
@@ -167,7 +167,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   }
 
   if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-                             VP9BORDERINPIXELS) < 0)
+                             VP9_ENC_BORDER_IN_PIXELS) < 0)
     goto fail;
 
   set_mb_mi(cm, aligned_width, aligned_height);
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 21e2b16a4..ad78b0dc4 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -122,7 +122,6 @@ typedef struct {
   TX_SIZE tx_size;
   int_mv mv[2];                // for each reference frame used
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  int_mv best_mv[2];
 
   uint8_t mode_context[MAX_REF_FRAMES];
 
@@ -242,6 +241,9 @@ typedef struct macroblockd {
   /* pointer to current frame */
   const YV12_BUFFER_CONFIG *cur_buf;
 
+  /* mc buffer */
+  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+
   int lossless;
   /* Inverse transform function pointers. */
   void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index f43a85f14..ba162fd20 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -112,8 +112,8 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
 // This macro is currently unused but may be used by certain implementations
 #define MAXBAND_INDEX 21
 
-extern const uint8_t vp9_coefband_trans_8x8plus[1024];
-extern const uint8_t vp9_coefband_trans_4x4[16];
+extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]);
+extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]);
 
 static const uint8_t *get_band_translate(TX_SIZE tx_size) {
   return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 06adbabaa..cd89390d5 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -32,8 +32,10 @@ static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                        mv_ref_list, -1, mi_row, mi_col);
 }
 
-#define LEFT_TOP_MARGIN     ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
-#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
+#define LEFT_TOP_MARGIN     ((VP9_ENC_BORDER_IN_PIXELS  \
+                            - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS  \
+                            - VP9_INTERP_EXTEND) << 3)
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 397f446f3..b5a9248c3 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -20,15 +20,16 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride,
-                             int x, int y, int b_w, int b_h, int w, int h) {
+static void build_mc_border(const uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            int x, int y, int b_w, int b_h, int w, int h) {
   // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * stride;
+  const uint8_t *ref_row = src - x - y * src_stride;
 
   if (y >= h)
-    ref_row += (h - 1) * stride;
+    ref_row += (h - 1) * src_stride;
   else if (y > 0)
-    ref_row += y * stride;
+    ref_row += y * src_stride;
 
   do {
     int right = 0, copy;
@@ -49,16 +50,16 @@ static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride,
       memset(dst, ref_row[0], left);
 
     if (copy)
-      memmove(dst + left, ref_row + x + left, copy);
+      memcpy(dst + left, ref_row + x + left, copy);
 
     if (right)
       memset(dst + left + copy, ref_row[w - 1], right);
 
-    dst += stride;
+    dst += dst_stride;
     ++y;
 
     if (y > 0 && y < h)
-      ref_row += stride;
+      ref_row += src_stride;
   } while (--b_h);
 }
 
@@ -281,7 +282,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
     MV32 scaled_mv;
     int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width,
-        frame_height, subpel_x, subpel_y;
+        frame_height, subpel_x, subpel_y, buf_stride;
     uint8_t *ref_frame, *buf_ptr;
     const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
 
@@ -308,7 +309,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
       scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
       xs = sf->x_step_q4;
       ys = sf->y_step_q4;
-      // Get block position in the scaled reference frame.
+      // Map the top left corner of the block into the reference frame.
       x0 = sf->scale_value_x(x0, sf);
       y0 = sf->scale_value_y(y0, sf);
       x0_16 = sf->scale_value_x(x0_16, sf);
@@ -321,7 +322,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
     subpel_x = scaled_mv.col & SUBPEL_MASK;
     subpel_y = scaled_mv.row & SUBPEL_MASK;
 
-    // Get reference block top left coordinate.
+    // Calculate the top left corner of the best matching block in the reference frame.
     x0 += scaled_mv.col >> SUBPEL_BITS;
     y0 += scaled_mv.row >> SUBPEL_BITS;
     x0_16 += scaled_mv.col;
@@ -329,24 +330,28 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
 
     // Get reference block bottom right coordinate.
     x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-    y1 = ((y0_16 + (h - 1) * xs) >> SUBPEL_BITS) + 1;
+    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
 
     // Get reference block pointer.
     buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+    buf_stride = pre_buf->stride;
 
-    // Do border extension if there is motion or
+    // Do border extension if there is motion or the
     // width/height is not a multiple of 8 pixels.
     if (scaled_mv.col || scaled_mv.row ||
         (frame_width & 0x7) || (frame_height & 0x7)) {
+      int x_pad = 0, y_pad = 0;
 
-      if (subpel_x) {
+      if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) {
         x0 -= VP9_INTERP_EXTEND - 1;
         x1 += VP9_INTERP_EXTEND;
+        x_pad = 1;
       }
 
-      if (subpel_y) {
+      if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) {
         y0 -= VP9_INTERP_EXTEND - 1;
         y1 += VP9_INTERP_EXTEND;
+        y_pad = 1;
       }
 
       // Skip border extension if block is inside the frame.
@@ -354,12 +359,14 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
           y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
         uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
         // Extend the border.
-        build_mc_border(buf_ptr1, buf_ptr1, pre_buf->stride, x0, y0, x1 - x0,
-                        y1 - y0, frame_width, frame_height);
+        build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0,
+                        x0, y0, x1 - x0, y1 - y0, frame_width, frame_height);
+        buf_stride = x1 - x0;
+        buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
       }
     }
 
-    inter_predictor(buf_ptr, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
                     subpel_y, sf, w, h, ref, &xd->subpix, xs, ys);
   }
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 8f858f47c..caa6947b3 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -96,7 +96,7 @@ prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint
 specialize vp9_v_predictor_8x8 $sse_x86inc neon
 
 prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2
+specialize vp9_tm_predictor_8x8 $sse2_x86inc neon dspr2
 
 prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2
@@ -742,7 +742,7 @@ specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
 
-prototype int vp9_refining_search_sad "struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
+prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
 specialize vp9_refining_search_sad sse3
 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
 
@@ -756,9 +756,5 @@ specialize vp9_full_range_search
 prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
 specialize vp9_temporal_filter_apply sse2
 
-prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
-specialize vp9_yv12_copy_partial_frame
-
-
 fi
 # end encoder functions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f4f758297..f95423678 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -23,105 +23,20 @@ typedef void filter8_1dfunction (
   const short *filter
 );
 
-#if (HAVE_SSSE3)
+#if HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-#if (ARCH_X86_64)
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  /* Ensure the filter can be compressed to int16_t. */
-  if (x_step_q4 == 16 && filter_x[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_x);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_x);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
-                          filter_x, x_step_q4, filter_y, y_step_q4,
-                          w, h);
-  }
-}
 
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  if (y_step_q4 == 16 && filter_y[3] != 128) {
-    while (w >= 16) {
-      vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      src += 16;
-      dst += 16;
-      w -= 16;
-    }
-    while (w >= 8) {
-      vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 8;
-      dst += 8;
-      w -= 8;
-    }
-    while (w >= 4) {
-      vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      src += 4;
-      dst += 4;
-      w -= 4;
-    }
-  }
-  if (w) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4, filter_y, y_step_q4,
-                         w, h);
-  }
-}
-
-#else
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -198,7 +113,6 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                          w, h);
   }
 }
-#endif
 
 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 303fced3b..000000000
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <tmmintrin.h>
-#include "vpx_ports/mem.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, const unsigned char,
-filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12};
-
-DECLARE_ALIGNED(16, const unsigned char,
-filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14};
-
-
-
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, srcReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the third 16 bit in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the seconds 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    // loading the local filters
-    thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8);
-    forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // extract the higher half of the lane
-        srcRegFilt3 =  _mm_srli_si128(srcRegFilt1, 8);
-        srcRegFilt4 =  _mm_srli_si128(srcRegFilt2, 8);
-
-        minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-        // add and saturate all the results together
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save only 4 bytes
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    __m128i addFilterReg64, filtersReg, minReg;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg);
-        srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-        // add and saturate all the results together
-        minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bits
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pixels_per_line;
-
-       // save only 8 bytes
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pixels_per_line,
-                                          unsigned char *output_ptr,
-                                          unsigned int output_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
-    __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits (first and second byte)
-    // across 128 bit register
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits (third and forth byte)
-    // across 128 bit register
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits (fifth and sixth byte)
-    // across 128 bit register
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits (seventh and eighth byte)
-    // across 128 bit register
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-    filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-    filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-    filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-    for (i = 0; i < output_height; i++) {
-        srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
-
-        // filter the source buffer
-        srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-
-        // reading the next 16 bytes.
-        // (part of it was being read by earlier read)
-        srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
-
-        // add and saturate the results together
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        // filter the source buffer
-        srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-        // filter the source buffer
-        srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg);
-        srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-
-        // add and saturate the results together
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_min_epi16(srcRegFilt3, srcRegFilt2));
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
-        _mm_max_epi16(srcRegFilt3, srcRegFilt2));
-
-        srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
-
-        srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
-        srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-        src_ptr+=src_pixels_per_line;
-
-        // save 16 bytes
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
-
-        output_ptr+=output_pitch;
-    }
-}
-
-
-
-void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, firstFilters, secondFilters;
-    __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter into the first lane
-    firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-    // duplicate only the second 16 bits in the filter into the second lane
-    firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-    // duplicate only the third 16 bits in the filter into the first lane
-    secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-    // duplicate only the forth 16 bits in the filter into the second lane
-    secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 4 byte
-        srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0]));
-        // load the next 4 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0]));
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-
-
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0]));
-        srcRegFilt3 =  _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0]));
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-
-        srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0]));
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0]));
-
-        // merge the result together
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2);
-
-        srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0]));
-        srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0]));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-
-        // extract the second lane of the 128 bit register
-        srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_srli_si128(srcRegFilt3, 8));
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 4 bytes convolve result
-        *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 8 bytes
-        srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
-        srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
-
-        // merge the result together
-        srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
-        // load the next 8 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
-        srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
-        srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
-        srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
-
-        // merge the result together
-        srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-        srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-        // add and saturate the results together
-        minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-        srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save only 8 bytes convolve result
-        _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
-
-
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int out_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
-    __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
-    __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-    __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    unsigned int i;
-
-    // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-    addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-    filtersReg = _mm_loadu_si128((__m128i *)filter);
-    // converting the 16 bit (short) to  8 bit (byte) and have the same data
-    // in both lanes of 128 bit register.
-    filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
-
-    // duplicate only the first 16 bits in the filter
-    firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-    // duplicate only the second 16 bits in the filter
-    secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-    // duplicate only the third 16 bits in the filter
-    thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-    // duplicate only the forth 16 bits in the filter
-    forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-
-    for (i = 0; i < output_height; i++) {
-        // load the first 16 bytes
-        srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
-        // load the next 16 bytes in stride of src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
-        srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
-
-        // merge the result together
-        srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-        srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
-        srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
-        srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-        srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-
-        // load the next 16 bytes in stride of two/three src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
-        // merge the result together
-        srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
-        srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
-
-        // load the next 16 bytes in stride of four/five src_pitch
-        srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
-        srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
-        // merge the result together
-        srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
-        srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
-
-        // multiply 2 adjacent elements with the filter and add the result
-        srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
-        srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_min_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_min_epi16(srcRegFilt6, srcRegFilt8));
-
-
-        // add and saturate the results together
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
-        _mm_max_epi16(srcRegFilt4, srcRegFilt7));
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
-        _mm_max_epi16(srcRegFilt6, srcRegFilt8));
-        srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
-        srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-        // shift by 7 bit each 16 bit
-        srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
-        srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-        // shrink to 8 bit each 16 bits, the first lane contain the first
-        // convolve result and the second lane contain the second convolve
-        // result
-        srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);
-
-        src_ptr+=src_pitch;
-
-        // save 16 bytes convolve result
-        _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
-
-        output_ptr+=out_pitch;
-    }
-}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 59faccdf7..d66ee2730 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -704,7 +704,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
     if (vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
-                                 VP9BORDERINPIXELS, ext_fb,
+                                 VP9_DEC_BORDER_IN_PIXELS, ext_fb,
                                  cm->realloc_fb_cb, cm->user_priv)) {
       vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                          "Failed to allocate external frame buffer");
@@ -712,7 +712,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
   } else {
     vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
                              cm->subsampling_x, cm->subsampling_y,
-                             VP9BORDERINPIXELS, NULL, NULL, NULL);
+                             VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL);
   }
 }
 
@@ -1129,11 +1129,12 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
 
   cm->show_existing_frame = vp9_rb_read_bit(rb);
   if (cm->show_existing_frame) {
-    // show an existing frame directly
+    // Show an existing frame directly.
     int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
     ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show);
     pbi->refresh_frame_flags = 0;
     cm->lf.filter_level = 0;
+    cm->show_frame = 1;
     return 0;
   }
 
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index c81378153..2eb99ea15 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -357,9 +357,9 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
 }
 
 static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
-                             int_mv mv[2], int_mv best_mv[2],
-                             int_mv nearest_mv[2], int_mv near_mv[2],
-                             int is_compound, int allow_hp, vp9_reader *r) {
+                            int_mv mv[2], int_mv ref_mv[2],
+                            int_mv nearest_mv[2], int_mv near_mv[2],
+                            int is_compound, int allow_hp, vp9_reader *r) {
   int i;
   int ret = 1;
 
@@ -367,10 +367,10 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
     case NEWMV: {
       nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
                                             NULL : &cm->counts.mv;
-      read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv,
               &cm->fc.nmvc, mv_counts, allow_hp);
       if (is_compound)
-        read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+        read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv,
                 &cm->fc.nmvc, mv_counts, allow_hp);
       for (i = 0; i < 1 + is_compound; ++i) {
         ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
@@ -380,17 +380,20 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
     }
     case NEARESTMV: {
       mv[0].as_int = nearest_mv[0].as_int;
-      if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+      if (is_compound)
+        mv[1].as_int = nearest_mv[1].as_int;
       break;
     }
     case NEARMV: {
       mv[0].as_int = near_mv[0].as_int;
-      if (is_compound) mv[1].as_int = near_mv[1].as_int;
+      if (is_compound)
+        mv[1].as_int = near_mv[1].as_int;
       break;
     }
     case ZEROMV: {
       mv[0].as_int = 0;
-      if (is_compound) mv[1].as_int = 0;
+      if (is_compound)
+        mv[1].as_int = 0;
       break;
     }
     default: {
@@ -423,7 +426,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
 
-  int_mv nearest[2], nearmv[2], best[2];
+  int_mv nearestmv[2], nearmv[2];
   int inter_mode_ctx, ref, is_compound;
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
@@ -452,8 +455,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]],
-                            &nearest[ref], &nearmv[ref]);
-      best[ref].as_int = nearest[ref].as_int;
+                            &nearestmv[ref], &nearmv[ref]);
     }
   }
 
@@ -466,6 +468,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];  // 1 or 2
     int idx, idy;
     int b_mode;
+    int_mv nearest_sub8x8[2], near_sub8x8[2];
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
@@ -475,9 +478,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
         if (b_mode == NEARESTMV || b_mode == NEARMV)
           for (ref = 0; ref < 1 + is_compound; ++ref)
             vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
-                                          &nearest[ref], &nearmv[ref]);
+                                          &nearest_sub8x8[ref],
+                                          &near_sub8x8[ref]);
 
-        if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+        if (!assign_mv(cm, b_mode, block, nearestmv,
+                       nearest_sub8x8, near_sub8x8,
                        is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
           break;
@@ -499,9 +504,8 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
-                                best, nearest, nearmv,
-                                is_compound, allow_hp, r);
+    xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
+                                nearestmv, nearmv, is_compound, allow_hp, r);
   }
 }
 
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index ec4dc14f4..7188d7674 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -260,6 +260,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
   struct segmentation *seg = &cm->seg;
   MB_MODE_INFO *const mi = &m->mbmi;
   const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
+  const MV_REFERENCE_FRAME sec_rf = mi->ref_frame[1];
   const MB_PREDICTION_MODE mode = mi->mode;
   const int segment_id = mi->segment_id;
   int skip_coeff;
@@ -355,11 +356,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
             active_section = 11;
 #endif
             vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv,
-                          &mi->best_mv[0].as_mv, nmvc, allow_hp);
+                          &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp);
 
             if (has_second_ref(mi))
               vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv,
-                            &mi->best_mv[1].as_mv, nmvc, allow_hp);
+                            &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp);
           }
         }
       }
@@ -368,11 +369,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
       active_section = 5;
 #endif
       vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv,
-                    &mi->best_mv[0].as_mv, nmvc, allow_hp);
+                    &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp);
 
       if (has_second_ref(mi))
         vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv,
-                      &mi->best_mv[1].as_mv, nmvc, allow_hp);
+                      &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp);
     }
   }
 }
@@ -745,7 +746,6 @@ static void update_coef_probs(VP9_COMP* cpi, vp9_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
-  vp9_clear_system_state();
 
   for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
     build_tree_distribution(cpi, tx_size);
@@ -1295,8 +1295,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
     active_section = 7;
 #endif
 
-  vp9_clear_system_state();  // __asm emms;
-
   first_part_size = write_compressed_header(cpi, data);
   data += first_part_size;
   vp9_wb_write_literal(&saved_wb, first_part_size, 16);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 75ea64a2f..9d02c8f95 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -500,17 +500,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
     if (is_inter_block(mbmi) &&
         (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) {
       int_mv best_mv[2];
-      const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
-      best_mv[0].as_int = ctx->best_ref_mv[0].as_int;
-      best_mv[1].as_int = ctx->best_ref_mv[1].as_int;
-      if (mbmi->mode == NEWMV) {
-        best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int;
-        if (rf2 > 0)
-          best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int;
-      }
-      mbmi->best_mv[0].as_int = best_mv[0].as_int;
-      mbmi->best_mv[1].as_int = best_mv[1].as_int;
+      for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
+        best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
       vp9_update_mv_count(cpi, x, best_mv);
     }
 
@@ -630,11 +621,11 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
-static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, int mi_col,
-                          int *totalrate, int64_t *totaldist,
-                          BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                          int64_t best_rd) {
+static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, int mi_col,
+                             int *totalrate, int64_t *totaldist,
+                             BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                             int64_t best_rd) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1079,35 +1070,35 @@ static void pick_partition_type(VP9_COMP *cpi,
 
   switch (partition) {
     case PARTITION_NONE:
-      pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist,
-                    bsize, get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist,
+                       bsize, get_block_context(x, bsize), INT64_MAX);
       break;
     case PARTITION_HORZ:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
+                       subsize, get_block_context(x, subsize), INT64_MAX);
       if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) {
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col,
-                      &sub_rate[1], &sub_dist[1], subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col,
+                         &sub_rate[1], &sub_dist[1], subsize,
+                         get_block_context(x, subsize), INT64_MAX);
       }
       *rate = sub_rate[0] + sub_rate[1];
       *dist = sub_dist[0] + sub_dist[1];
       break;
     case PARTITION_VERT:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0],
+                       subsize, get_block_context(x, subsize), INT64_MAX);
       if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) {
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize,
-                      &sub_rate[1], &sub_dist[1], subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize,
+                         &sub_rate[1], &sub_dist[1], subsize,
+                         get_block_context(x, subsize), INT64_MAX);
       }
       *rate = sub_rate[0] + sub_rate[1];
       *dist = sub_dist[1] + sub_dist[1];
@@ -1244,8 +1235,8 @@ static void rd_use_partition(VP9_COMP *cpi,
         mi_row + (ms >> 1) < cm->mi_rows &&
         mi_col + (ms >> 1) < cm->mi_cols) {
       *(get_sb_partitioning(x, bsize)) = bsize;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
-                    get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
+                       get_block_context(x, bsize), INT64_MAX);
 
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
@@ -1260,13 +1251,15 @@ static void rd_use_partition(VP9_COMP *cpi,
 
   switch (partition) {
     case PARTITION_NONE:
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    bsize, get_block_context(x, bsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, bsize,
+                       get_block_context(x, bsize), INT64_MAX);
       break;
     case PARTITION_HORZ:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize,
+                       get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
         int rt = 0;
@@ -1274,8 +1267,8 @@ static void rd_use_partition(VP9_COMP *cpi,
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt,
+                         subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT_MAX;
@@ -1288,8 +1281,9 @@ static void rd_use_partition(VP9_COMP *cpi,
       break;
     case PARTITION_VERT:
       *get_sb_index(x, subsize) = 0;
-      pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
-                    subsize, get_block_context(x, subsize), INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate,
+                       &last_part_dist, subsize,
+                       get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
           bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
         int rt = 0;
@@ -1297,8 +1291,8 @@ static void rd_use_partition(VP9_COMP *cpi,
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
         *get_sb_index(x, subsize) = 1;
-        pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
-                      get_block_context(x, subsize), INT64_MAX);
+        rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt,
+                         subsize, get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
           last_part_rate = INT_MAX;
           last_part_dist = INT_MAX;
@@ -1372,9 +1366,9 @@ static void rd_use_partition(VP9_COMP *cpi,
 
       save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
-                    split_subsize, get_block_context(x, split_subsize),
-                    INT64_MAX);
+      rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+                       split_subsize, get_block_context(x, split_subsize),
+                       INT64_MAX);
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
@@ -1738,8 +1732,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
-                  get_block_context(x, bsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
+                     get_block_context(x, bsize), best_rd);
     if (this_rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
         pl = partition_plane_context(cpi->above_seg_context,
@@ -1849,8 +1843,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         partition_none_allowed)
       get_block_context(x, subsize)->pred_filter_type =
           get_block_context(x, bsize)->mic.mbmi.interp_filter;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
 
     if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
@@ -1864,9 +1858,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         get_block_context(x, subsize)->pred_filter_type =
             get_block_context(x, bsize)->mic.mbmi.interp_filter;
-      pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+      rd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
+                       &this_dist, subsize, get_block_context(x, subsize),
+                       best_rd - sum_rd);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1902,8 +1896,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
         partition_none_allowed)
       get_block_context(x, subsize)->pred_filter_type =
           get_block_context(x, bsize)->mic.mbmi.interp_filter;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
-                  get_block_context(x, subsize), best_rd);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+                     get_block_context(x, subsize), best_rd);
     sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
     if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
@@ -1916,9 +1910,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
           partition_none_allowed)
         get_block_context(x, subsize)->pred_filter_type =
             get_block_context(x, bsize)->mic.mbmi.interp_filter;
-      pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
-                    &this_dist, subsize, get_block_context(x, subsize),
-                    best_rd - sum_rd);
+      rd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
+                       &this_dist, subsize, get_block_context(x, subsize),
+                       best_rd - sum_rd);
       if (this_rate == INT_MAX) {
         sum_rd = INT64_MAX;
       } else {
@@ -1989,8 +1983,8 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
   if ((mi_row + (ms >> 1) < cm->mi_rows) &&
       (mi_col + (ms >> 1) < cm->mi_cols)) {
     cpi->set_ref_frame_mask = 1;
-    pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
-                  get_block_context(x, BLOCK_64X64), INT64_MAX);
+    rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
+                     get_block_context(x, BLOCK_64X64), INT64_MAX);
     pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, BLOCK_64X64);
     r += x->partition_cost[pl][PARTITION_NONE];
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index af710a8f4..853094b29 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -224,13 +224,9 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
   }
 }
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h) {
-  vp9_clear_system_state();
+                              int usehp, int mvc_flag_v, int mvc_flag_h) {
   vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
   if (mvc_flag_v)
     build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 538599d58..56872682a 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -317,31 +317,23 @@ static const double weight_table[256] = {
   1.000000, 1.000000, 1.000000, 1.000000
 };
 
-static double simple_weight(YV12_BUFFER_CONFIG *source) {
+static double simple_weight(const YV12_BUFFER_CONFIG *buf) {
   int i, j;
+  double sum = 0.0;
+  const int w = buf->y_crop_width;
+  const int h = buf->y_crop_height;
+  const uint8_t *row = buf->y_buffer;
+
+  for (i = 0; i < h; ++i) {
+    const uint8_t *pixel = row;
+    for (j = 0; j < w; ++j)
+      sum += weight_table[*pixel++];
+    row += buf->y_stride;
+  }
 
-  uint8_t *src = source->y_buffer;
-  double sum_weights = 0.0;
-
-  // Loop through the Y plane examining levels and creating a weight for
-  // the image.
-  i = source->y_height;
-  do {
-    j = source->y_width;
-    do {
-      sum_weights += weight_table[ *src];
-      src++;
-    } while (--j);
-    src -= source->y_width;
-    src += source->y_stride;
-  } while (--i);
-
-  sum_weights /= (source->y_height * source->y_width);
-
-  return sum_weights;
+  return MAX(0.1, sum / (w * h));
 }
 
-
 // This function returns the maximum target rate per frame.
 static int frame_max_bits(VP9_COMP *cpi) {
   int64_t max_bits =
@@ -394,42 +386,35 @@ static unsigned int zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                     MV *ref_mv, MV *best_mv,
+                                     const MV *ref_mv, MV *best_mv,
                                      YV12_BUFFER_CONFIG *recon_buffer,
                                      int *best_motion_err, int recon_yoffset) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int num00;
-
   MV tmp_mv = {0, 0};
-  MV ref_mv_full;
-
-  int tmp_err;
+  MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int num00, tmp_err, n, sr = 0;
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-  int n;
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type];
+  const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
+  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   int new_mv_mode_penalty = 256;
-
-  int sr = 0;
-  int quart_frm = MIN(cpi->common.width, cpi->common.height);
+  const int quart_frm = MIN(cpi->common.width, cpi->common.height);
 
   // refine the motion search range accroding to the frame dimension
   // for first pass test
   while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
     sr++;
 
-  step_param    += sr;
+  step_param += sr;
   further_steps -= sr;
 
   // override the default variance function to use MSE
-  v_fn_ptr.vf = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type);
+  v_fn_ptr.vf = get_block_variance_fn(bsize);
 
   // Set up pointers for this macro block recon buffer
   xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
   // Initial step/diamond search centred on best mv
-  ref_mv_full.col = ref_mv->col >> 3;
-  ref_mv_full.row = ref_mv->row >> 3;
   tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                     step_param,
                                     x->sadperbit16, &num00, &v_fn_ptr,
@@ -504,6 +489,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
   int new_mv_count = 0;
   int sum_in_vectors = 0;
   uint32_t lastmv_as_int = 0;
+  struct twopass_rc *const twopass = &cpi->twopass;
 
   int_mv zero_ref_mv;
 
@@ -792,20 +778,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
     fps.intra_error = intra_error >> 8;
     fps.coded_error = coded_error >> 8;
     fps.sr_coded_error = sr_coded_error >> 8;
-    fps.ssim_weighted_pred_err = fps.coded_error *
-                                     MAX(0.1, simple_weight(cpi->Source));
-    fps.pcnt_inter = 0.0;
-    fps.pcnt_motion = 0.0;
-    fps.MVr = 0.0;
-    fps.mvr_abs = 0.0;
-    fps.MVc = 0.0;
-    fps.mvc_abs = 0.0;
-    fps.MVrv = 0.0;
-    fps.MVcv = 0.0;
-    fps.mv_in_out_count = 0.0;
-    fps.new_mv_count = 0.0;
+    fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source);
     fps.count = 1.0;
-
     fps.pcnt_inter = (double)intercount / cm->MBs;
     fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
     fps.pcnt_neutral = (double)neutral_count / cm->MBs;
@@ -821,7 +795,17 @@ void vp9_first_pass(VP9_COMP *cpi) {
                      mvcount;
       fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
       fps.new_mv_count = new_mv_count;
-      fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+      fps.pcnt_motion = (double)mvcount / cpi->common.MBs;
+    } else {
+      fps.MVr = 0.0;
+      fps.mvr_abs = 0.0;
+      fps.MVc = 0.0;
+      fps.mvc_abs = 0.0;
+      fps.MVrv = 0.0;
+      fps.MVcv = 0.0;
+      fps.mv_in_out_count = 0.0;
+      fps.new_mv_count = 0.0;
+      fps.pcnt_motion = 0.0;
     }
 
     // TODO(paulwilkins):  Handle the case when duration is set to 0, or
@@ -830,23 +814,22 @@ void vp9_first_pass(VP9_COMP *cpi) {
     fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    cpi->twopass.this_frame_stats = fps;
-    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
-    accumulate_stats(&cpi->twopass.total_stats, &fps);
+    twopass->this_frame_stats = fps;
+    output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats);
+    accumulate_stats(&twopass->total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
   // the prediction is good enough... but also dont allow it to lag too far
-  if ((cpi->twopass.sr_update_lag > 3) ||
+  if ((twopass->sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats.intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
-        2.0))) {
+       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+       ((twopass->this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
-    cpi->twopass.sr_update_lag = 1;
+    twopass->sr_update_lag = 1;
   } else {
-    cpi->twopass.sr_update_lag++;
+    twopass->sr_update_lag++;
   }
   // swap frame pointers so last frame refers to the frame we just compressed
   swap_yv12(lst_yv12, new_yv12);
@@ -1034,37 +1017,38 @@ extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
 void vp9_init_second_pass(VP9_COMP *cpi) {
   FIRSTPASS_STATS this_frame;
   FIRSTPASS_STATS *start_pos;
+  struct twopass_rc *const twopass = &cpi->twopass;
 
-  zero_stats(&cpi->twopass.total_stats);
-  zero_stats(&cpi->twopass.total_left_stats);
+  zero_stats(&twopass->total_stats);
+  zero_stats(&twopass->total_left_stats);
 
-  if (!cpi->twopass.stats_in_end)
+  if (!twopass->stats_in_end)
     return;
 
-  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
+  twopass->total_stats = *twopass->stats_in_end;
+  twopass->total_left_stats = twopass->total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
   // encoded in the second pass is a guess.  However the sum duration is not.
   // Its calculated based on the actual durations of all frames from the first
   // pass.
-  vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
-                       cpi->twopass.total_stats.duration);
+  vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count /
+                        twopass->total_stats.duration);
 
   cpi->output_framerate = cpi->oxcf.framerate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
-                                     cpi->oxcf.target_bandwidth / 10000000.0);
+  twopass->bits_left = (int64_t)(twopass->total_stats.duration *
+                                 cpi->oxcf.target_bandwidth / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
   // scores used in the second pass. We have this minimum to make sure
   // that clips that are static but "low complexity" in the intra domain
   // are still boosted appropriately for KF/GF/ARF
-  cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
-  cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+  twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+  twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
 
   // This variable monitors how far behind the second ref update is lagging
-  cpi->twopass.sr_update_lag = 1;
+  twopass->sr_update_lag = 1;
 
   // Scan the first pass file and calculate an average Intra / Inter error score
   // ratio for the sequence.
@@ -1072,43 +1056,43 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
     double sum_iiratio = 0.0;
     double IIRatio;
 
-    start_pos = cpi->twopass.stats_in;  // Note the starting "file" position.
+    start_pos = twopass->stats_in;  // Note the starting "file" position.
 
-    while (input_stats(&cpi->twopass, &this_frame) != EOF) {
+    while (input_stats(twopass, &this_frame) != EOF) {
       IIRatio = this_frame.intra_error
                 / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
       IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio /
-        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
+    twopass->avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count);
 
     // Reset file position
-    reset_fpf_position(&cpi->twopass, start_pos);
+    reset_fpf_position(twopass, start_pos);
   }
 
   // Scan the first pass file and calculate a modified total error based upon
   // the bias/power function used to allocate bits.
   {
-    double av_error = cpi->twopass.total_stats.ssim_weighted_pred_err /
-                      DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.count);
+    double av_error = twopass->total_stats.ssim_weighted_pred_err /
+                      DOUBLE_DIVIDE_CHECK(twopass->total_stats.count);
 
-    start_pos = cpi->twopass.stats_in;  // Note starting "file" position
+    start_pos = twopass->stats_in;  // Note starting "file" position
 
-    cpi->twopass.modified_error_total = 0.0;
-    cpi->twopass.modified_error_min =
+    twopass->modified_error_total = 0.0;
+    twopass->modified_error_min =
       (av_error * cpi->oxcf.two_pass_vbrmin_section) / 100;
-    cpi->twopass.modified_error_max =
+    twopass->modified_error_max =
       (av_error * cpi->oxcf.two_pass_vbrmax_section) / 100;
 
-    while (input_stats(&cpi->twopass, &this_frame) != EOF) {
-      cpi->twopass.modified_error_total +=
+    while (input_stats(twopass, &this_frame) != EOF) {
+      twopass->modified_error_total +=
           calculate_modified_err(cpi, &this_frame);
     }
-    cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+    twopass->modified_error_left = twopass->modified_error_total;
 
-    reset_fpf_position(&cpi->twopass, start_pos);  // Reset file position
+    reset_fpf_position(twopass, start_pos);
   }
 }
 
@@ -1965,7 +1949,10 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) {
        cpi->rc.frames_to_key == 0 ||
        (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
+                                    cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
+    cpi->rc.kf_boost = 300;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -1982,7 +1969,10 @@ void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) {
       cpi->rc.frames_to_key == 0 ||
       (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
     cm->frame_type = KEY_FRAME;
+    cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 &&
+                                    cpi->rc.frames_to_key == 0;
     cpi->rc.frames_to_key = cpi->key_frame_frequency;
+    cpi->rc.kf_boost = 300;
   } else {
     cm->frame_type = INTER_FRAME;
   }
@@ -2054,7 +2044,8 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) {
   this_frame_coded_error = this_frame.coded_error;
 
   // keyframe and section processing !
-  if (rc->frames_to_key == 0) {
+  if (rc->frames_to_key == 0 ||
+      (cpi->common.frame_flags & FRAMEFLAGS_KEY)) {
     // Define next KF group and assign bits to it
     this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
@@ -2225,12 +2216,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
 
   RATE_CONTROL *const rc = &cpi->rc;
+  struct twopass_rc *const twopass = &cpi->twopass;
 
   vp9_zero(next_frame);
 
   vp9_clear_system_state();  // __asm emms;
 
-  start_position = cpi->twopass.stats_in;
+  start_position = twopass->stats_in;
   cpi->common.frame_type = KEY_FRAME;
 
   // is this a forced key frame by interval
@@ -2247,14 +2239,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   // Take a copy of the initial frame details
   first_frame = *this_frame;
 
-  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
-  cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
 
   kf_mod_err = calculate_modified_err(cpi, this_frame);
 
   // find the next keyframe
   i = 0;
-  while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) {
+  while (twopass->stats_in < twopass->stats_in_end) {
     // Accumulate kf group error
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
@@ -2266,11 +2258,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // load a the next frame's stats
     last_frame = *this_frame;
-    input_stats(&cpi->twopass, this_frame);
+    input_stats(twopass, this_frame);
 
     // Provided that we are not at the end of the file...
     if (cpi->oxcf.auto_key &&
-        lookup_next_frame_stats(&cpi->twopass, &next_frame) != EOF) {
+        lookup_next_frame_stats(twopass, &next_frame) != EOF) {
       // Normal scene cut check
       if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
@@ -2320,7 +2312,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     tmp_frame = first_frame;
 
     // Reset to the start of the group
-    reset_fpf_position(&cpi->twopass, start_position);
+    reset_fpf_position(twopass, start_position);
 
     kf_group_err = 0;
     kf_group_intra_err = 0;
@@ -2334,17 +2326,17 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       kf_group_coded_err += tmp_frame.coded_error;
 
       // Load a the next frame's stats
-      input_stats(&cpi->twopass, &tmp_frame);
+      input_stats(twopass, &tmp_frame);
     }
     rc->next_key_frame_forced = 1;
-  } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) {
+  } else if (twopass->stats_in == twopass->stats_in_end) {
     rc->next_key_frame_forced = 1;
   } else {
     rc->next_key_frame_forced = 0;
   }
 
   // Special case for the last key frame of the file
-  if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
+  if (twopass->stats_in >= twopass->stats_in_end) {
     // Accumulate kf group error
     kf_group_err += calculate_modified_err(cpi, this_frame);
 
@@ -2356,8 +2348,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
-  if ((cpi->twopass.bits_left > 0) &&
-      (cpi->twopass.modified_error_left > 0.0)) {
+  if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
     // Max for a single normal frame (not key frame)
     int max_bits = frame_max_bits(cpi);
 
@@ -2366,19 +2357,18 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // Default allocation based on bits left and relative
     // complexity of the section
-    cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left *
-                                           (kf_group_err /
-                                            cpi->twopass.modified_error_left));
+    twopass->kf_group_bits = (int64_t)(twopass->bits_left *
+       (kf_group_err / twopass->modified_error_left));
 
     // Clip based on maximum per frame rate defined by the user.
     max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
-    if (cpi->twopass.kf_group_bits > max_grp_bits)
-      cpi->twopass.kf_group_bits = max_grp_bits;
+    if (twopass->kf_group_bits > max_grp_bits)
+      twopass->kf_group_bits = max_grp_bits;
   } else {
-    cpi->twopass.kf_group_bits = 0;
+    twopass->kf_group_bits = 0;
   }
   // Reset the first pass file position
-  reset_fpf_position(&cpi->twopass, start_position);
+  reset_fpf_position(twopass, start_position);
 
   // Determine how big to make this keyframe based on how well the subsequent
   // frames use inter blocks.
@@ -2390,7 +2380,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   for (i = 0; i < rc->frames_to_key; i++) {
     double r;
 
-    if (EOF == input_stats(&cpi->twopass, &next_frame))
+    if (EOF == input_stats(twopass, &next_frame))
       break;
 
     // Monitor for static sections.
@@ -2402,11 +2392,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
     // For the first few frames collect data to decide kf boost.
     if (i <= (rc->max_gf_interval * 2)) {
-      if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+      if (next_frame.intra_error > twopass->kf_intra_err_min)
         r = (IIKFACTOR2 * next_frame.intra_error /
              DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
       else
-        r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+        r = (IIKFACTOR2 * twopass->kf_intra_err_min /
              DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
 
       if (r > RMAX)
@@ -2428,21 +2418,21 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     FIRSTPASS_STATS sectionstats;
 
     zero_stats(&sectionstats);
-    reset_fpf_position(&cpi->twopass, start_position);
+    reset_fpf_position(twopass, start_position);
 
     for (i = 0; i < rc->frames_to_key; i++) {
-      input_stats(&cpi->twopass, &next_frame);
+      input_stats(twopass, &next_frame);
       accumulate_stats(&sectionstats, &next_frame);
     }
 
     avg_stats(&sectionstats);
 
-    cpi->twopass.section_intra_rating = (int) (sectionstats.intra_error /
+    twopass->section_intra_rating = (int) (sectionstats.intra_error /
         DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
   }
 
   // Reset the first pass file position
-  reset_fpf_position(&cpi->twopass, start_position);
+  reset_fpf_position(twopass, start_position);
 
   // Work out how many bits to allocate for the key frame itself
   if (1) {
@@ -2459,7 +2449,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // Make a note of baseline boost and the zero motion
     // accumulator value for use elsewhere.
     rc->kf_boost = kf_boost;
-    cpi->twopass.kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+    twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
     // We do three calculations for kf size.
     // The first is based on the error score for the whole kf group.
@@ -2474,11 +2464,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
     // cpi->rc.frames_to_key-1 because key frame itself is taken
     // care of by kf_boost.
     if (zero_motion_accumulator >= 0.99) {
-      allocation_chunks =
-        ((rc->frames_to_key - 1) * 10) + kf_boost;
+      allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost;
     } else {
-      allocation_chunks =
-        ((rc->frames_to_key - 1) * 100) + kf_boost;
+      allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost;
     }
 
     // Prevent overflow
@@ -2488,58 +2476,54 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
       allocation_chunks /= divisor;
     }
 
-    cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0
-           : cpi->twopass.kf_group_bits;
+    twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0
+           : twopass->kf_group_bits;
 
     // Calculate the number of bits to be spent on the key frame
-    cpi->twopass.kf_bits = (int)((double)kf_boost *
-              ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+    twopass->kf_bits = (int)((double)kf_boost *
+        ((double)twopass->kf_group_bits / allocation_chunks));
 
     // If the key frame is actually easier than the average for the
     // kf group (which does sometimes happen... eg a blank intro frame)
     // Then use an alternate calculation based on the kf error score
     // which should give a smaller key frame.
     if (kf_mod_err < kf_group_err / rc->frames_to_key) {
-      double  alt_kf_grp_bits =
-        ((double)cpi->twopass.bits_left *
+      double  alt_kf_grp_bits = ((double)twopass->bits_left *
          (kf_mod_err * (double)rc->frames_to_key) /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+         DOUBLE_DIVIDE_CHECK(twopass->modified_error_left));
 
       alt_kf_bits = (int)((double)kf_boost *
                           (alt_kf_grp_bits / (double)allocation_chunks));
 
-      if (cpi->twopass.kf_bits > alt_kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
-      }
+      if (twopass->kf_bits > alt_kf_bits)
+        twopass->kf_bits = alt_kf_bits;
     } else {
     // Else if it is much harder than other frames in the group make sure
     // it at least receives an allocation in keeping with its relative
     // error score
-      alt_kf_bits = (int)((double)cpi->twopass.bits_left *
-              (kf_mod_err /
-               DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+      alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err /
+               DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)));
 
-      if (alt_kf_bits > cpi->twopass.kf_bits) {
-        cpi->twopass.kf_bits = alt_kf_bits;
+      if (alt_kf_bits > twopass->kf_bits) {
+        twopass->kf_bits = alt_kf_bits;
       }
     }
 
-    cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+    twopass->kf_group_bits -= twopass->kf_bits;
 
     // Peer frame bit target for this frame
-    rc->per_frame_bandwidth = cpi->twopass.kf_bits;
+    rc->per_frame_bandwidth = twopass->kf_bits;
     // Convert to a per second bitrate
-    cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
-                                  cpi->output_framerate);
+    cpi->target_bandwidth = (int)(twopass->kf_bits * cpi->output_framerate);
   }
 
   // Note the total error score of the kf group minus the key frame itself
-  cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
 
   // Adjust the count of total modified error left.
   // The count of bits left is adjusted elsewhere based on real coded frame
   // sizes.
-  cpi->twopass.modified_error_left -= kf_group_err;
+  twopass->modified_error_left -= kf_group_err;
 }
 
 void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 277bd7db1..ee73ff15a 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -73,7 +73,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
     for (i = 0; i < depth; i++)
       if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
                                  width, height, subsampling_x, subsampling_y,
-                                 VP9BORDERINPIXELS))
+                                 VP9_ENC_BORDER_IN_PIXELS))
         goto bail;
   }
   return ctx;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index efb5ce16d..ad9cc00b1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -24,10 +24,15 @@
 // #define NEW_DIAMOND_SEARCH
 
 void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) {
-  const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
-  const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
-  const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
-  const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+  int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+  int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+  int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+  int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+  col_min = MAX(col_min, (MV_LOW >> 3) + 1);
+  row_min = MAX(row_min, (MV_LOW >> 3) + 1);
+  col_max = MIN(col_max, (MV_UPP >> 3) - 1);
+  row_max = MIN(row_max, (MV_UPP >> 3) - 1);
 
   // Get intersection of UMV window and valid MV window to reduce # of checks
   // in diamond search.
@@ -174,8 +179,10 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
       error_per_bit + 4096) >> 13 : 0)
 
 
-#define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset
-                                // for svf calc
+// Offset for the svf calc: the low 3 bits of an MV component, doubled.
+static INLINE int sp(int x) {
+  return (x & 7) * 2;
+}
 
 #define IFMVCV(r, c, s, e)                                \
     if (c >= minc && c <= maxc && r >= minr && r <= maxr) \
@@ -183,12 +190,14 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
     else                                                  \
       e;
 
-/* pointer to predictor base of a motionvector */
-#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset)))
+static INLINE uint8_t *pre(uint8_t *buf, int stride, int r, int c, int offset) {
+  return buf + ((r >> 3) * stride + (c >> 3) - offset);  // MV predictor base
+}
 
 /* returns subpixel variance error function */
 #define DIST(r, c) \
-    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, src_stride, &sse)
+    vfp->svf(pre(y, y_stride, r, c, offset), y_stride, sp(c), sp(r), z, \
+             src_stride, &sse)
 
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
@@ -358,7 +367,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
 #undef DIST
 /* returns subpixel variance error function */
 #define DIST(r, c) \
-    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+    vfp->svaf(pre(y, y_stride, r, c, offset), y_stride, sp(c), sp(r), \
               z, src_stride, &sse, second_pred)
 
 int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
@@ -1703,58 +1712,51 @@ int vp9_full_search_sadx8(MACROBLOCK *x, MV *ref_mv,
   else
     return INT_MAX;
 }
-int vp9_refining_search_sad_c(MACROBLOCK *x,
+
+int vp9_refining_search_sad_c(const MACROBLOCK *x,
                               MV *ref_mv, int error_per_bit,
                               int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2],
                               const MV *center_mv) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
-  int this_row_offset, this_col_offset;
 
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *what = x->plane[0].src.buf;
-  uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->row * xd->plane[0].pre[0].stride) +
-                          ref_mv->col;
-  uint8_t *check_here;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *const what = x->plane[0].src.buf;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
+  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
+                                             ref_mv->col];
   unsigned int thissad;
+
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   MV this_mv;
-  unsigned int bestsad = INT_MAX;
-  MV fcenter_mv;
 
-  int *mvjsadcost = x->nmvjointsadcost;
+  const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fcenter_mv.row = center_mv->row >> 3;
-  fcenter_mv.col = center_mv->col >> 3;
-
-  bestsad = fn_ptr->sdf(what, what_stride, best_address,
-                        in_what_stride, 0x7fffffff) +
-                        mvsad_err_cost(ref_mv, &fcenter_mv,
-                                       mvjsadcost, mvsadcost, error_per_bit);
+  unsigned int bestsad = fn_ptr->sdf(what, what_stride, best_address,
+                                     in_what_stride, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
 
   for (i = 0; i < search_range; i++) {
     int best_site = -1;
 
     for (j = 0; j < 4; j++) {
-      this_row_offset = ref_mv->row + neighbors[j].row;
-      this_col_offset = ref_mv->col + neighbors[j].col;
-
-      if ((this_col_offset > x->mv_col_min) &&
-          (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) &&
-          (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
-                     best_address;
+      this_mv.row = ref_mv->row + neighbors[j].row;
+      this_mv.col = ref_mv->col + neighbors[j].col;
+
+      if ((this_mv.col > x->mv_col_min) &&
+          (this_mv.col < x->mv_col_max) &&
+          (this_mv.row > x->mv_row_min) &&
+          (this_mv.row < x->mv_row_max)) {
+        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
+                                                this_mv.col];
         thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
                               bestsad);
 
         if (thissad < bestsad) {
-          this_mv.row = this_row_offset;
-          this_mv.col = this_col_offset;
           thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
 
@@ -1771,8 +1773,7 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
     } else {
       ref_mv->row += neighbors[best_site].row;
       ref_mv->col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-                      neighbors[best_site].col;
+      best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col];
     }
   }
 
@@ -1782,13 +1783,12 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
   if (bestsad < INT_MAX)
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
                       (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv, center_mv,
-                                  mvjcost, mvcost, x->errorperbit);
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
   else
     return INT_MAX;
 }
 
-int vp9_refining_search_sadx4(MACROBLOCK *x,
+int vp9_refining_search_sadx4(const MACROBLOCK *x,
                               MV *ref_mv, int error_per_bit,
                               int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2],
@@ -1810,7 +1810,7 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
   unsigned int bestsad = INT_MAX;
   MV fcenter_mv;
 
-  int *mvjsadcost = x->nmvjointsadcost;
+  const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
   fcenter_mv.row = center_mv->row >> 3;
@@ -1903,65 +1903,54 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
     return INT_MAX;
 }
 
-/* This function is called when we do joint motion search in comp_inter_inter
- * mode.
- */
-int vp9_refining_search_8p_c(MACROBLOCK *x,
+// This function is called when we do joint motion search in comp_inter_inter
+// mode.
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
                              MV *ref_mv, int error_per_bit,
                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                              int *mvjcost, int *mvcost[2], const MV *center_mv,
                              const uint8_t *second_pred, int w, int h) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
-      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+                           {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
   int i, j;
-  int this_row_offset, this_col_offset;
 
-  int what_stride = x->plane[0].src.stride;
-  int in_what_stride = xd->plane[0].pre[0].stride;
-  uint8_t *what = x->plane[0].src.buf;
-  uint8_t *best_address = xd->plane[0].pre[0].buf +
-                          (ref_mv->row * xd->plane[0].pre[0].stride) +
-                          ref_mv->col;
-  uint8_t *check_here;
+  const uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what = xd->plane[0].pre[0].buf;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
+                                             ref_mv->col];
   unsigned int thissad;
   MV this_mv;
-  unsigned int bestsad = INT_MAX;
-  MV fcenter_mv;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
 
-  int *mvjsadcost = x->nmvjointsadcost;
+  const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fcenter_mv.row = center_mv->row >> 3;
-  fcenter_mv.col = center_mv->col >> 3;
-
   /* Get compound pred by averaging two pred blocks. */
-  bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride,
-                         second_pred, 0x7fffffff) +
-      mvsad_err_cost(ref_mv, &fcenter_mv,
-                     mvjsadcost, mvsadcost, error_per_bit);
+  unsigned int bestsad = fn_ptr->sdaf(what, what_stride,
+                                      best_address, in_what_stride,
+                                      second_pred, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
 
-  for (i = 0; i < search_range; i++) {
+  for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
     for (j = 0; j < 8; j++) {
-      this_row_offset = ref_mv->row + neighbors[j].row;
-      this_col_offset = ref_mv->col + neighbors[j].col;
+      this_mv.row = ref_mv->row + neighbors[j].row;
+      this_mv.col = ref_mv->col + neighbors[j].col;
 
-      if ((this_col_offset > x->mv_col_min) &&
-          (this_col_offset < x->mv_col_max) &&
-          (this_row_offset > x->mv_row_min) &&
-          (this_row_offset < x->mv_row_max)) {
-        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
-            best_address;
+      if ((this_mv.col > x->mv_col_min) &&
+          (this_mv.col < x->mv_col_max) &&
+          (this_mv.row > x->mv_row_min) &&
+          (this_mv.row < x->mv_row_max)) {
+        const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
+                                                this_mv.col];
 
-        /* Get compound block and use it to calculate SAD. */
         thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride,
                                second_pred, bestsad);
-
         if (thissad < bestsad) {
-          this_mv.row = this_row_offset;
-          this_mv.col = this_col_offset;
           thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
           if (thissad < bestsad) {
@@ -1977,8 +1966,7 @@ int vp9_refining_search_8p_c(MACROBLOCK *x,
     } else {
       ref_mv->row += neighbors[best_site].row;
       ref_mv->col += neighbors[best_site].col;
-      best_address += (neighbors[best_site].row) * in_what_stride +
-          neighbors[best_site].col;
+      best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col];
     }
   }
 
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index b3d89752d..74035842f 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -108,7 +108,7 @@ typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
                                     int *mvjcost, int *mvcost[2],
                                     const MV *center_mv, int n);
 
-typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x,
+typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x,
                                         MV *ref_mv, int sad_per_bit,
                                         int distance,
                                         vp9_variance_fn_ptr_t *fn_ptr,
@@ -123,7 +123,7 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
                                        int *mvjcost, int *mvcost[2],
                                        const MV *center_mv);
 
-int vp9_refining_search_8p_c(MACROBLOCK *x,
+int vp9_refining_search_8p_c(const MACROBLOCK *x,
                              MV *ref_mv, int error_per_bit,
                              int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                              int *mvjcost, int *mvcost[2],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 42d4196c5..27531d232 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -36,6 +36,7 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
 #include "vp9/encoder/vp9_vaq.h"
+#include "vp9/encoder/vp9_resize.h"
 
 #include "vpx_ports/vpx_timer.h"
 
@@ -968,7 +969,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
   if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
                                cpi->oxcf.width, cpi->oxcf.height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -983,14 +984,14 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
   if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
                              cm->width, cm->height,
                              cm->subsampling_x, cm->subsampling_y,
-                             VP9BORDERINPIXELS))
+                             VP9_ENC_BORDER_IN_PIXELS))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
   if (vp9_alloc_frame_buffer(&cpi->scaled_source,
                              cm->width, cm->height,
                              cm->subsampling_x, cm->subsampling_y,
-                             VP9BORDERINPIXELS))
+                             VP9_ENC_BORDER_IN_PIXELS))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -1036,14 +1037,14 @@ static void update_frame_size(VP9_COMP *cpi) {
   if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate last frame buffer");
 
   if (vp9_realloc_frame_buffer(&cpi->scaled_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS, NULL, NULL, NULL))
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
@@ -2295,6 +2296,42 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
 }
 #endif
 
+// Rescales every plane of src_fb into dst_fb with vp9_resize_plane(), then
+// extends the borders of dst_fb. Planes 0 and 3 (Y, alpha) use the luma crop
+// size; the remaining planes use the chroma crop size.
+static void scale_and_extend_frame_nonnormative(YV12_BUFFER_CONFIG *src_fb,
+                                                YV12_BUFFER_CONFIG *dst_fb) {
+  // Buffers and strides per plane, in Y, U, V, alpha order.
+  uint8_t *const src_planes[4] = {src_fb->y_buffer, src_fb->u_buffer,
+                                  src_fb->v_buffer, src_fb->alpha_buffer};
+  uint8_t *const dst_planes[4] = {dst_fb->y_buffer, dst_fb->u_buffer,
+                                  dst_fb->v_buffer, dst_fb->alpha_buffer};
+
+  const int src_pitches[4] = {src_fb->y_stride, src_fb->uv_stride,
+                              src_fb->uv_stride, src_fb->alpha_stride};
+  const int dst_pitches[4] = {dst_fb->y_stride, dst_fb->uv_stride,
+                              dst_fb->uv_stride, dst_fb->alpha_stride};
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const int luma_sized = (plane == 0 || plane == 3);
+    const int src_h = luma_sized ? src_fb->y_crop_height
+                                 : src_fb->uv_crop_height;
+    const int src_w = luma_sized ? src_fb->y_crop_width
+                                 : src_fb->uv_crop_width;
+    const int dst_h = luma_sized ? dst_fb->y_crop_height
+                                 : dst_fb->uv_crop_height;
+    const int dst_w = luma_sized ? dst_fb->y_crop_width
+                                 : dst_fb->uv_crop_width;
+
+    // Dimensions are passed height first, then width.
+    vp9_resize_plane(src_planes[plane], src_h, src_w, src_pitches[plane],
+                     dst_planes[plane], dst_h, dst_w, dst_pitches[plane]);
+  }
+
+  vp8_yv12_extend_frame_borders(dst_fb);
+}
+
 static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
                                    YV12_BUFFER_CONFIG *dst_fb) {
   const int in_w = src_fb->y_crop_width;
@@ -2316,7 +2353,7 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
   for (y = 0; y < out_h; y += 16) {
     for (x = 0; x < out_w; x += 16) {
       for (i = 0; i < MAX_MB_PLANE; ++i) {
-        const int factor = i == 0 ? 1 : 2;
+        const int factor = (i == 0 || i == 3 ? 1 : 2);
         const int x_q4 = x * (16 / factor) * in_w / out_w;
         const int y_q4 = y * (16 / factor) * in_h / out_h;
         const int src_stride = src_strides[i];
@@ -2552,7 +2589,7 @@ static void scale_references(VP9_COMP *cpi) {
       vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
-                               VP9BORDERINPIXELS, NULL, NULL, NULL);
+                               VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
     } else {
@@ -2924,7 +2961,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   /* Scale the source buffer, if required. */
   if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
       cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
-    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+    scale_and_extend_frame_nonnormative(cpi->un_scaled_source,
+                                        &cpi->scaled_source);
     cpi->Source = &cpi->scaled_source;
   } else {
     cpi->Source = cpi->un_scaled_source;
@@ -3279,12 +3317,12 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
   vp9_twopass_postencode_update(cpi, *size);
 }
 
-static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {
+static void check_initial_width(VP9_COMP *cpi, int subsampling_x,
+                                int subsampling_y) {
   VP9_COMMON *const cm = &cpi->common;
   if (!cpi->initial_width) {
-    // TODO(agrange) Subsampling defaults to assuming sampled chroma.
-    cm->subsampling_x = sd != NULL ? (sd->uv_width < sd->y_width) : 1;
-    cm->subsampling_y = sd != NULL ? (sd->uv_height < sd->y_height) : 1;
+    cm->subsampling_x = subsampling_x;
+    cm->subsampling_y = subsampling_y;
     alloc_raw_frame_buffers(cpi);
     cpi->initial_width = cm->width;
     cpi->initial_height = cm->height;
@@ -3298,8 +3336,10 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
   VP9_COMP              *cpi = (VP9_COMP *) ptr;
   struct vpx_usec_timer  timer;
   int                    res = 0;
+  const int    subsampling_x = sd->uv_width  < sd->y_width;
+  const int    subsampling_y = sd->uv_height < sd->y_height;
 
-  check_initial_width(cpi, sd);
+  check_initial_width(cpi, subsampling_x, subsampling_y);
   vpx_usec_timer_start(&timer);
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL))
@@ -3377,7 +3417,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG *force_src_buffer = NULL;
   MV_REFERENCE_FRAME ref_frame;
-  // FILE *fp_out = fopen("enc_frame_type.txt", "a");
 
   if (!cpi)
     return -1;
@@ -3499,8 +3538,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
       vp9_end_first_pass(cpi);    /* get last stats packet */
       cpi->twopass.first_pass_done = 1;
     }
-
-    // fclose(fp_out);
     return -1;
   }
 
@@ -3543,7 +3580,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
                            cm->width, cm->height,
                            cm->subsampling_x, cm->subsampling_y,
-                           VP9BORDERINPIXELS, NULL, NULL, NULL);
+                           VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
 
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -3669,7 +3706,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
   }
 
 #endif
-  // fclose(fp_out);
   return 0;
 }
 
@@ -3689,7 +3725,8 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
       *dest = *cpi->common.frame_to_show;
       dest->y_width = cpi->common.width;
       dest->y_height = cpi->common.height;
-      dest->uv_height = cpi->common.height / 2;
+      dest->uv_width = cpi->common.width >> cpi->common.subsampling_x;
+      dest->uv_height = cpi->common.height >> cpi->common.subsampling_y;
       ret = 0;
     } else {
       ret = -1;
@@ -3797,7 +3834,7 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
   VP9_COMP *cpi = (VP9_COMP *)comp;
   VP9_COMMON *cm = &cpi->common;
 
-  check_initial_width(cpi, NULL);
+  check_initial_width(cpi, 1, 1);
 
   if (width) {
     cm->width = width;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 7a5282dda..a4ceabdf1 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -20,77 +20,43 @@
 #include "vp9/common/vp9_loopfilter.h"
 #include "./vpx_scale_rtcd.h"
 
-void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
-                                   YV12_BUFFER_CONFIG *dst_ybc, int fraction) {
-  const int height = src_ybc->y_height;
-  const int stride = src_ybc->y_stride;
-  const int offset = stride * ((height >> 5) * 16 - 8);
-  const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4;
-
-  assert(src_ybc->y_stride == dst_ybc->y_stride);
-  vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset,
-             stride * (lines_to_copy + 16));
-}
-
-// Enforce a minimum filter level based upon baseline Q
 static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
-  int min_filter_level;
-  min_filter_level = 0;
-
-  return min_filter_level;
+  return 0;
 }
 
-// Enforce a maximum filter level based upon baseline Q
 static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
-  int max_filter_level = MAX_LOOP_FILTER;
-  (void)base_qindex;
-
-  if (cpi->twopass.section_intra_rating > 8)
-    max_filter_level = MAX_LOOP_FILTER * 3 / 4;
-
-  return max_filter_level;
+  return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+                                               : MAX_LOOP_FILTER;
 }
 
-
 // Stub function for now Alt LF not used
 void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
 }
 
 void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   VP9_COMMON *const cm = &cpi->common;
   struct loopfilter *const lf = &cm->lf;
-
-  int best_err = 0;
-  int filt_err = 0;
   const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
   const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-
-  int filter_step;
-  int filt_high = 0;
-  // Start search at previous frame filter level
-  int filt_mid = lf->filter_level;
-  int filt_low = 0;
+  int best_err = 0;
+  int filt_err = 0;
   int filt_best;
   int filt_direction = 0;
-
-  int Bias = 0;  // Bias against raising loop filter in favor of lowering it.
-
-  //  Make a copy of the unfiltered / processed recon buffer
-  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
+  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
 
   lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
                                                     : cpi->oxcf.sharpness;
 
-  // Start the search at the previous frame filter level unless it is now out of
-  // range.
-  filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
-
-  // Define the initial step size
-  filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Make a copy of the unfiltered / processed recon buffer
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
 
   // Get baseline error score
   vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial);
+  vp9_loop_filter_frame(cm, xd, filt_mid, 1, partial);
 
   best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
   filt_best = filt_mid;
@@ -99,35 +65,32 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
   vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
   while (filter_step > 0) {
-    Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+    const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
+
+    // Bias against raising loop filter in favor of lowering it.
+    int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
     if (cpi->twopass.section_intra_rating < 20)
-      Bias = Bias * cpi->twopass.section_intra_rating / 20;
+      bias = bias * cpi->twopass.section_intra_rating / 20;
 
     // yx, bias less for large block size
-    if (cpi->common.tx_mode != ONLY_4X4)
-      Bias >>= 1;
-
-    filt_high = ((filt_mid + filter_step) > max_filter_level)
-                    ? max_filter_level
-                    : (filt_mid + filter_step);
-    filt_low = ((filt_mid - filter_step) < min_filter_level)
-                   ? min_filter_level
-                   : (filt_mid - filter_step);
+    if (cm->tx_mode != ONLY_4X4)
+      bias >>= 1;
 
-    if ((filt_direction <= 0) && (filt_low != filt_mid)) {
+    if (filt_direction <= 0 && filt_low != filt_mid) {
       // Get Low filter error score
       vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial);
+      vp9_loop_filter_frame(cm, xd, filt_low, 1, partial);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
-      //  Re-instate the unfiltered frame
+      // Re-instate the unfiltered frame
       vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
-      if ((filt_err - Bias) < best_err) {
+      if ((filt_err - bias) < best_err) {
         // Was it actually better than the previous best?
         if (filt_err < best_err)
           best_err = filt_err;
@@ -137,9 +100,9 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
     }
 
     // Now look at filt_high
-    if ((filt_direction >= 0) && (filt_high != filt_mid)) {
+    if (filt_direction >= 0 && filt_high != filt_mid) {
       vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial);
+      vp9_loop_filter_frame(cm, xd, filt_high, 1, partial);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -147,7 +110,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) {
       vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
 
       // Was it better than the previous best?
-      if (filt_err < (best_err - Bias)) {
+      if (filt_err < (best_err - bias)) {
         best_err = filt_err;
         filt_best = filt_high;
       }
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 17d1f5984..f317f2a0d 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -174,7 +174,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+      vp9_setup_buffer_inter(cpi, x, tile,
                              ref_frame, block_size, mi_row, mi_col,
                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 72ab00f98..3ebf98c0f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -211,19 +211,16 @@ static int estimate_bits_at_q(int frame_kind, int q, int mbs,
 
 
 static void calc_iframe_target_size(VP9_COMP *cpi) {
-  // boost defaults to half second
+  const VP9_CONFIG *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
   int target;
 
-  // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();  // __asm emms;
 
-  // New Two pass RC
-  target = cpi->rc.per_frame_bandwidth;
-
   // For 1-pass.
   if (cpi->pass == 0) {
     if (cpi->common.current_video_frame == 0) {
-      target = cpi->oxcf.starting_buffer_level / 2;
+      target = oxcf->starting_buffer_level / 2;
     } else {
       // TODO(marpan): Add in adjustment based on Q.
       // If this keyframe was forced, use a more recent Q estimate.
@@ -235,47 +232,49 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
       // Adjustment up based on q: need to fix.
       // kf_boost = kf_boost * kfboost_qadjust(Q) / 100;
       // Frame separation adjustment (down).
-      if (cpi->rc.frames_since_key  < cpi->output_framerate / 2) {
-        kf_boost = (int)(kf_boost * cpi->rc.frames_since_key /
-            (cpi->output_framerate / 2));
+      if (rc->frames_since_key  < cpi->output_framerate / 2) {
+        kf_boost = (int)(kf_boost * rc->frames_since_key /
+                       (cpi->output_framerate / 2));
       }
       kf_boost = (kf_boost < 16) ? 16 : kf_boost;
-      target = ((16 + kf_boost) * cpi->rc.per_frame_bandwidth) >> 4;
+      target = ((16 + kf_boost) * rc->per_frame_bandwidth) >> 4;
     }
-    cpi->rc.active_worst_quality = cpi->rc.worst_quality;
+    rc->active_worst_quality = rc->worst_quality;
+  } else {
+    target = rc->per_frame_bandwidth;
   }
 
-  if (cpi->oxcf.rc_max_intra_bitrate_pct) {
-    int max_rate = cpi->rc.per_frame_bandwidth
-                 * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
-
-    if (target > max_rate)
-      target = max_rate;
+  if (oxcf->rc_max_intra_bitrate_pct) {
+    const int max_rate = rc->per_frame_bandwidth *
+                             oxcf->rc_max_intra_bitrate_pct / 100;
+    target = MIN(target, max_rate);
   }
-  cpi->rc.this_frame_target = target;
+  rc->this_frame_target = target;
 }
 
 // Update the buffer level: leaky bucket model.
 void vp9_update_buffer_level(VP9_COMP *const cpi, int encoded_frame_size) {
-  VP9_COMMON *const cm = &cpi->common;
+  const VP9_COMMON *const cm = &cpi->common;
+  const VP9_CONFIG *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
+
   // Non-viewable frames are a special case and are treated as pure overhead.
   if (!cm->show_frame) {
     rc->bits_off_target -= encoded_frame_size;
   } else {
     rc->bits_off_target += rc->av_per_frame_bandwidth - encoded_frame_size;
   }
   // Clip the buffer level to the maximum specified buffer size.
-  if (rc->bits_off_target > cpi->oxcf.maximum_buffer_size) {
-    rc->bits_off_target = cpi->oxcf.maximum_buffer_size;
-  }
+  rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size);
   rc->buffer_level = rc->bits_off_target;
 }
 
 int vp9_drop_frame(VP9_COMP *const cpi) {
+  const VP9_CONFIG *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
 
-  if (!cpi->oxcf.drop_frames_water_mark) {
+
+  if (!oxcf->drop_frames_water_mark) {
     return 0;
   } else {
     if (rc->buffer_level < 0) {
@@ -284,8 +283,8 @@ int vp9_drop_frame(VP9_COMP *const cpi) {
     } else {
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
-      int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
-          cpi->oxcf.optimal_buffer_level / 100);
+      int drop_mark = (int)(oxcf->drop_frames_water_mark *
+                                oxcf->optimal_buffer_level / 100);
       if ((rc->buffer_level > drop_mark) &&
           (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
@@ -310,14 +309,14 @@ int vp9_drop_frame(VP9_COMP *const cpi) {
 }
 
 // Adjust active_worst_quality level based on buffer level.
-static int adjust_active_worst_quality_from_buffer_level(const VP9_COMP *cpi) {
+static int adjust_active_worst_quality_from_buffer_level(const VP9_CONFIG *oxcf,
+    const RATE_CONTROL *rc) {
   // Adjust active_worst_quality: If buffer is above the optimal/target level,
   // bring active_worst_quality down depending on fullness over buffer.
   // If buffer is below the optimal level, let the active_worst_quality go from
   // ambient Q (at buffer = optimal level) to worst_quality level
   // (at buffer = critical level).
-  const RATE_CONTROL *const rc = &cpi->rc;
-  const VP9_CONFIG *const oxcf = &cpi->oxcf;
+
   int active_worst_quality = rc->active_worst_quality;
   // Maximum limit for down adjustment, ~20%.
   int max_adjustment_down = active_worst_quality / 5;
@@ -354,31 +353,23 @@ static int adjust_active_worst_quality_from_buffer_level(const VP9_COMP *cpi) {
 }
 
 // Adjust target frame size with respect to the buffering constraints:
-static int target_size_from_buffer_level(const VP9_COMP *cpi) {
-  const RATE_CONTROL *const rc = &cpi->rc;
-  const VP9_CONFIG *const oxcf = &cpi->oxcf;
-  int this_frame_target = cpi->rc.this_frame_target;
-  int percent_low = 0;
-  int percent_high = 0;
-  int one_percent_bits = (int)(1 + oxcf->optimal_buffer_level / 100);
-  if (rc->buffer_level < oxcf->optimal_buffer_level) {
-    percent_low = (int)((oxcf->optimal_buffer_level - rc->buffer_level) /
-                      one_percent_bits);
-    if (percent_low > oxcf->under_shoot_pct)
-      percent_low = oxcf->under_shoot_pct;
+static int target_size_from_buffer_level(const VP9_CONFIG *oxcf,
+                                         const RATE_CONTROL *rc) {
+  int target = rc->this_frame_target;
+  const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
 
+  if (diff > 0) {
     // Lower the target bandwidth for this frame.
-    this_frame_target -= (this_frame_target * percent_low) / 200;
-  } else  if (rc->buffer_level > oxcf->optimal_buffer_level) {
-    percent_high = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
-                     one_percent_bits);
-    if (percent_high > oxcf->over_shoot_pct)
-      percent_high = oxcf->over_shoot_pct;
-
+    const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
     // Increase the target bandwidth for this frame.
-    this_frame_target += (this_frame_target * percent_high) / 200;
+    const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
   }
-  return this_frame_target;
+
+  return target;
 }
 
 static void calc_pframe_target_size(VP9_COMP *const cpi) {
@@ -400,10 +391,10 @@ static void calc_pframe_target_size(VP9_COMP *const cpi) {
       // For now, use: cpi->rc.av_per_frame_bandwidth / 16:
       min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4,
                              FRAME_OVERHEAD_BITS);
-      rc->this_frame_target = target_size_from_buffer_level(cpi);
+      rc->this_frame_target = target_size_from_buffer_level(oxcf, rc);
       // Adjust qp-max based on buffer level.
       rc->active_worst_quality =
-          adjust_active_worst_quality_from_buffer_level(cpi);
+          adjust_active_worst_quality_from_buffer_level(oxcf, rc);
     }
   }
 
@@ -602,7 +593,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
                                             (last_boosted_q * 0.75));
       active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
     } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) {
-      // not first frame of one pass
+      // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index fa6b362d4..242aa8710 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -233,8 +233,8 @@ static void set_block_thresholds(VP9_COMP *cpi) {
     const int q = compute_rd_thresh_factor(qindex);
 
     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
-      // Threshold here seem unecessarily harsh but fine given actual
-      // range of values used for cpi->sf.thresh_mult[]
+      // Threshold here seems unnecessarily harsh but fine given actual
+      // range of values used for cpi->sf.thresh_mult[].
       const int t = q * rd_thresh_block_size_factor[bsize];
       const int thresh_max = INT_MAX / t;
 
@@ -419,18 +419,12 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
     struct macroblock_plane *const p = &x->plane[i];
     struct macroblockd_plane *const pd = &xd->plane[i];
     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-    int rate;
-    int64_t dist;
     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                               pd->dst.buf, pd->dst.stride, &sse);
     if (i == 0)
       x->pred_sse[ref] = sse;
-    // sse works better than var, since there is no dc prediction used
-    model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
-                             pd->dequant[1] >> 3, &rate, &dist);
 
-    rate_sum += rate;
-    dist_sum += (int)dist;
+    dist_sum += (int)sse;
   }
 
   *out_rate_sum = rate_sum;
@@ -2276,41 +2270,38 @@ static void setup_pred_block(const MACROBLOCKD *xd,
 
 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                             const TileInfo *const tile,
-                            int idx, MV_REFERENCE_FRAME frame_type,
+                            MV_REFERENCE_FRAME ref_frame,
                             BLOCK_SIZE block_size,
                             int mi_row, int mi_col,
                             int_mv frame_nearest_mv[MAX_REF_FRAMES],
                             int_mv frame_near_mv[MAX_REF_FRAMES],
                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
-  VP9_COMMON *cm = &cpi->common;
-  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
+  const VP9_COMMON *cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  const struct scale_factors *const sf = &cm->frame_refs[frame_type - 1].sf;
-
+  MODE_INFO *const mi = xd->mi_8x8[0];
+  int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
+  const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col, sf, sf);
+  setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0],
-                   xd->last_mi,
-                   frame_type,
-                   mbmi->ref_mvs[frame_type], mi_row, mi_col);
+  vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref_frame, candidates,
+                   mi_row, mi_col);
 
   // Candidate refinement carried out at encoder and decoder
-  vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
-                        mbmi->ref_mvs[frame_type],
-                        &frame_nearest_mv[frame_type],
-                        &frame_near_mv[frame_type]);
+  vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+                        &frame_nearest_mv[ref_frame],
+                        &frame_near_mv[ref_frame]);
 
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
   // The current implementation doesn't support scaling.
   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
-    mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
-            frame_type, block_size);
+    mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
+            ref_frame, block_size);
 }
 
 YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
@@ -3173,7 +3164,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+      vp9_setup_buffer_inter(cpi, x, tile,
                              ref_frame, block_size, mi_row, mi_col,
                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     }
@@ -3798,7 +3789,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame),
+      vp9_setup_buffer_inter(cpi, x, tile,
                              ref_frame, block_size, mi_row, mi_col,
                              frame_mv[NEARESTMV], frame_mv[NEARMV],
                              yv12_mb);
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 696cf6b11..9ac1f5404 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -39,7 +39,7 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                             const TileInfo *const tile,
-                            int idx, MV_REFERENCE_FRAME frame_type,
+                            MV_REFERENCE_FRAME ref_frame,
                             BLOCK_SIZE block_size,
                             int mi_row, int mi_col,
                             int_mv frame_nearest_mv[MAX_REF_FRAMES],
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
new file mode 100644
index 000000000..f15abc07d
--- /dev/null
+++ b/vp9/encoder/vp9_resize.c
@@ -0,0 +1,418 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_resize.h"
+#include "vpx/vpx_integer.h"
+
+#define FILTER_BITS               7   // Kernel taps sum to 1 << FILTER_BITS.
+
+#define INTERP_TAPS               8   // Taps per interpolation kernel.
+#define SUBPEL_BITS               5   // 1 << SUBPEL_BITS phases per pel.
+#define SUBPEL_MASK               ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS     32  // Fraction bits of a source position.
+
+#define ROUND_POWER_OF_TWO(value, n) \
+    (((value) + (1 << ((n) - 1))) >> (n))  // Adds half of 1 << n, then shifts.
+
+typedef int16_t interp_kernel[INTERP_TAPS];  // One kernel: INTERP_TAPS taps.
+
+// Interpolation kernels by sub-pel phase; even phase 0 filters integer pels.
+const interp_kernel vp9_filteredinterp_filters[(1 << SUBPEL_BITS)] = {
+  {-1, -8, 33, 80, 33, -8, -1, 0},  // Phase 0: sums to 128 (1 << FILTER_BITS).
+  {-1, -8, 30, 80, 35, -8, -1, 1},
+  {-1, -8, 28, 80, 37, -7, -2, 1},
+  {0, -8, 26, 79, 39, -7, -2, 1},
+  {0, -8, 24, 79, 41, -7, -2, 1},
+  {0, -8, 22, 78, 43, -6, -2, 1},
+  {0, -8, 20, 78, 45, -5, -3, 1},
+  {0, -8, 18, 77, 48, -5, -3, 1},
+  {0, -8, 16, 76, 50, -4, -3, 1},
+  {0, -8, 15, 75, 52, -3, -4, 1},
+  {0, -7, 13, 74, 54, -3, -4, 1},
+  {0, -7, 11, 73, 56, -2, -4, 1},
+  {0, -7, 10, 71, 58, -1, -4, 1},
+  {1, -7,  8, 70, 60,  0, -5, 1},
+  {1, -6,  6, 68, 62,  1, -5, 1},
+  {1, -6,  5, 67, 63,  2, -5, 1},
+  {1, -6,  4, 65, 65,  4, -6, 1},  // Phase 16 (half pel): symmetric taps.
+  {1, -5,  2, 63, 67,  5, -6, 1},
+  {1, -5,  1, 62, 68,  6, -6, 1},
+  {1, -5,  0, 60, 70,  8, -7, 1},
+  {1, -4, -1, 58, 71, 10, -7, 0},
+  {1, -4, -2, 56, 73, 11, -7, 0},
+  {1, -4, -3, 54, 74, 13, -7, 0},
+  {1, -4, -3, 52, 75, 15, -8, 0},
+  {1, -3, -4, 50, 76, 16, -8, 0},
+  {1, -3, -5, 48, 77, 18, -8, 0},
+  {1, -3, -5, 45, 78, 20, -8, 0},
+  {1, -2, -6, 43, 78, 22, -8, 0},
+  {1, -2, -7, 41, 79, 24, -8, 0},
+  {1, -2, -7, 39, 79, 26, -8, 0},
+  {1, -2, -7, 37, 80, 28, -8, -1},
+  {1, -1, -8, 35, 80, 30, -8, -1},
+};
+
+// Half-kernels (innermost tap first) for factor of 2 downsampling.
+static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1};  // Mirrored: 8 taps.
+static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3};  // [0] is the center: 7 taps.
+
+static void interpolate(const uint8_t *const input, int inlength,
+                        uint8_t *output, int outlength) {
+  const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) /
+      outlength;  // Source step per output sample, fixed point, rounded.
+  const int64_t offset = inlength > outlength ?  // Position of output sample 0:
+      (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength :
+      -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength;
+  uint8_t *optr = output;
+  int x, x1, x2, sum, k, int_pel, sub_pel;
+  int64_t y;  // Source position of output sample x, same fixed point as delta.
+
+  x = 0;
+  y = offset;
+  while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+    x++;
+    y += delta;
+  }
+  x1 = x;  // First output index whose leftmost tap is at input index >= 0.
+  x = outlength - 1;
+  y = delta * x + offset;
+  while ((y >> INTERP_PRECISION_BITS) +
+         (int64_t)(INTERP_TAPS / 2) >= inlength) {
+    x--;
+    y -= delta;
+  }
+  x2 = x;  // Last output index whose rightmost tap is below inlength.
+  if (x1 > x2) {  // No sample has all taps in range: clamp on both sides.
+    for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;  // Integer part of the position.
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = vp9_filteredinterp_filters[sub_pel];  // Kernel for this phase.
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k) {
+        const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;  // Tap k's index.
+        sum += filter[k] * input[(pk < 0 ? 0 :
+                                  (pk >= inlength ? inlength - 1 : pk))];
+      }
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  } else {
+    // Initial part: taps left of input[0] are clamped to index 0.
+    for (x = 0, y = offset; x < x1; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = vp9_filteredinterp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ?
+                                  0 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // Middle part: every tap is in range, so no clamping.
+    for (; x <= x2; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = vp9_filteredinterp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+    // End part: taps past the last sample are clamped to inlength - 1.
+    for (; x < outlength; ++x, y += delta) {
+      const int16_t *filter;
+      int_pel = y >> INTERP_PRECISION_BITS;
+      sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+      filter = vp9_filteredinterp_filters[sub_pel];
+      sum = 0;
+      for (k = 0; k < INTERP_TAPS; ++k)
+        sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >=
+                                  inlength ?  inlength - 1 :
+                                  int_pel - INTERP_TAPS / 2 + 1 + k)];
+      *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+    }
+  }
+}
+
+static void down2_symeven(const uint8_t *const input, int length,
+                          uint8_t *output) {
+  // Actual filter len = 2 * filter_len_half; one output per two inputs.
+  static const int16_t *filter = vp9_down2_symeven_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+  int i, j;
+  uint8_t *optr = output;
+  int l1 = filter_len_half;  // From here on no left tap falls below index 0.
+  int l2 = (length - filter_len_half);  // From here on right taps may overrun.
+  l1 += (l1 & 1);  // Round both bounds up to even: i advances in steps of 2.
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: clamp taps on both sides.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));  // Rounding offset for the shift.
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: left taps are clamped to index 0.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: all taps are in range.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: right taps are clamped to index length - 1.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1));
+      for (j = 0; j < filter_len_half; ++j) {
+        sum += (input[i - j] +
+                input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+static void down2_symodd(const uint8_t *const input, int length,
+                         uint8_t *output) {
+  // Actual filter len = 2 * filter_len_half - 1; centered on input[i].
+  static const int16_t *filter = vp9_down2_symodd_half_filter;
+  const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+  int i, j;
+  uint8_t *optr = output;
+  int l1 = filter_len_half - 1;  // From here on no left tap falls below 0.
+  int l2 = (length - filter_len_half + 1);  // From here on right taps may overrun.
+  l1 += (l1 & 1);  // Round both bounds up to even: i advances in steps of 2.
+  l2 += (l2 & 1);
+  if (l1 > l2) {
+    // Short input length: clamp taps on both sides.
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];  // Rounding + center tap.
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part: left taps are clamped to index 0.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part: all taps are in range.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part: right taps are clamped to index length - 1.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+            filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+static int get_down2_length(int length, int steps) {
+  int s;
+  for (s = 0; s < steps; ++s)
+    length = (length + 1) >> 1;  // Halve, rounding up.
+  return length;  // Length after `steps` successive halvings.
+}
+
+// Returns how many 2:1 decimation steps can be applied to in_length while the
+// result stays >= out_length. Iteration stops at length 1: halving 1 yields 1
+// again, so without that guard out_length <= 1 would never terminate.
+int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  for (; in_length > 1; in_length = get_down2_length(in_length, 1), ++steps)
+    if (get_down2_length(in_length, 1) < out_length) break;
+  return steps;
+}
+
+// Resizes input[0..length) into output[0..olength): 2:1 filtered decimation
+// while the result stays >= olength, then interpolate() for what remains.
+// buf is optional scratch of at least length bytes; NULL allocates it here.
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *buf) {
+  int steps;
+  if (length == olength) {
+    memcpy(output, input, sizeof(uint8_t) * length);
+    return;
+  }
+  steps = get_down2_steps(length, olength);
+  if (steps > 0) {
+    int s;
+    uint8_t *out = NULL;
+    // Prefer the caller's scratch buffer; allocate only when none was given.
+    // tmpbuf stays NULL in the former case, so the free() below is a no-op.
+    uint8_t *tmpbuf = buf ? NULL : (uint8_t *)malloc(sizeof(uint8_t) * length);
+    uint8_t *const otmp = buf ? buf : tmpbuf;
+    uint8_t *otmp2;
+    int filteredlength = length;
+    if (!otmp) {
+      // Allocation failed: fall back to one direct interpolation pass.
+      interpolate(input, length, output, olength);
+      return;
+    }
+    // Successive steps alternate between two regions of the scratch buffer.
+    otmp2 = otmp + get_down2_length(length, 1);
+    for (s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint8_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ? otmp2 : otmp);
+      if (filteredlength & 1)
+        down2_symodd(in, filteredlength, out);
+      else
+        down2_symeven(in, filteredlength, out);
+      filteredlength = proj_filteredlength;
+    }
+    if (filteredlength != olength)
+      interpolate(out, filteredlength, output, olength);
+    free(tmpbuf);
+  } else {
+    interpolate(input, length, output, olength);
+  }
+}
+
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+  int i;
+  uint8_t *iptr = img;  // Walks down one image column, stride bytes per step.
+  uint8_t *aptr = arr;  // Walks the contiguous destination array.
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *aptr++ = *iptr;  // Gather: column sample i -> arr[i].
+  }
+}
+
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+  int i;
+  uint8_t *iptr = img;  // Walks down one image column, stride bytes per step.
+  uint8_t *aptr = arr;  // Walks the contiguous source array.
+  for (i = 0; i < len; ++i, iptr += stride) {
+    *iptr = *aptr++;  // Scatter: arr[i] -> column sample i.
+  }
+}
+
+// Resizes a height x width plane to height2 x width2: every row is resized
+// into a width2 x height intermediate, then every column of that vertically.
+// Leaves output unmodified if a scratch allocation fails.
+void vp9_resize_plane(const uint8_t *const input, int height, int width,
+                      int in_stride, uint8_t *output, int height2, int width2,
+                      int out_stride) {
+  int i;
+  uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
+  uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
+                                      (width < height ? height : width));
+  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+  if (intbuf && tmpbuf && arrbuf) {
+    for (i = 0; i < height; ++i)
+      resize_multistep(input + in_stride * i, width,
+                       intbuf + width2 * i, width2, tmpbuf);
+    for (i = 0; i < width2; ++i) {
+      fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+      resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
+      fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+    }
+  }
+  free(intbuf);
+  free(tmpbuf);
+  free(arrbuf);
+}
+
+void vp9_resize_frame420(const uint8_t *const y,
+                         int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {  // Resizes Y, then U, then V.
+  vp9_resize_plane(y, height, width, y_stride,
+                   oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height / 2, width / 2, uv_stride,  // Chroma: half width and height.
+                   ou, oheight / 2, owidth / 2, ouv_stride);  // Odd sizes round down.
+  vp9_resize_plane(v, height / 2, width / 2, uv_stride,
+                   ov, oheight / 2, owidth / 2, ouv_stride);
+}
+
+void vp9_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {  // Resizes Y, then U, then V.
+  vp9_resize_plane(y, height, width, y_stride,
+                   oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height, width / 2, uv_stride,  // Chroma: half width, full height.
+                   ou, oheight, owidth / 2, ouv_stride);  // Odd widths round down.
+  vp9_resize_plane(v, height, width / 2, uv_stride,
+                   ov, oheight, owidth / 2, ouv_stride);
+}
+
+void vp9_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth) {  // Resizes Y, then U, then V.
+  vp9_resize_plane(y, height, width, y_stride,
+                   oy, oheight, owidth, oy_stride);
+  vp9_resize_plane(u, height, width, uv_stride,  // Chroma: same size as luma.
+                   ou, oheight, owidth, ouv_stride);
+  vp9_resize_plane(v, height, width, uv_stride,
+                   ov, oheight, owidth, ouv_stride);
+}
diff --git a/vp9/encoder/vp9_resize.h b/vp9/encoder/vp9_resize.h
new file mode 100644
index 000000000..c67595a3f
--- /dev/null
+++ b/vp9/encoder/vp9_resize.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_RESIZE_H_
+#define VP9_ENCODER_VP9_RESIZE_H_
+
+#include <stdio.h>
+
+// uint8_t is used below; include its provider so this header stands alone.
+#include "vpx/vpx_integer.h"
+
+// Resizes one plane of 8-bit samples from height x width to
+// height2 x width2. Source row r starts at input + r * in_stride and
+// destination row r at output + r * out_stride. input is only read; the
+// result is written to output.
+void vp9_resize_plane(const uint8_t *const input,
+                      int height,
+                      int width,
+                      int in_stride,
+                      uint8_t *output,
+                      int height2,
+                      int width2,
+                      int out_stride);
+
+// Resizes a frame stored as three separate planes from height x width to
+// oheight x owidth. y/u/v are the source planes and oy/ou/ov the
+// destinations; both chroma planes share uv_stride (source) and ouv_stride
+// (destination). Luma is resized at the given sizes, each chroma plane at
+// half the width and half the height (integer division).
+void vp9_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth);
+
+// Same contract as vp9_resize_frame420(), except that each chroma plane is
+// resized at half the width (integer division) and the full height of the
+// frame.
+void vp9_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth);
+
+// Same contract as vp9_resize_frame420(), except that each chroma plane is
+// resized at the full width and height of the frame, exactly like the luma
+// plane.
+void vp9_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride,
+                         int height, int width,
+                         uint8_t *oy, int oy_stride,
+                         uint8_t *ou, uint8_t *ov, int ouv_stride,
+                         int oheight, int owidth);
+
+#endif    // VP9_ENCODER_VP9_RESIZE_H_
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 55d595baf..58c5df47e 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -8,31 +8,44 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <stdlib.h>
+
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
-#include "vp9/encoder/vp9_sadmxn.h"
-#include "vp9/encoder/vp9_variance.h"
+
 #include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_variance.h"
+
+static INLINE unsigned int sad(const uint8_t *a, int a_stride,
+                               const uint8_t *b, int b_stride,
+                               int width, int height) {
+  int y, x;
+  unsigned int sad = 0;  // Running sum; shadows the function name locally.
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);  // Absolute difference of co-located samples.
+
+    a += a_stride;  // Advance both blocks to their next row.
+    b += b_stride;
+  }
+
+  return sad;  // Sum of absolute differences over the width x height block.
+}
 
 #define sad_mxn_func(m, n) \
-unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
-                                  int  src_stride, \
-                                  const uint8_t *ref_ptr, \
-                                  int  ref_stride, \
+unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \
+                                  const uint8_t *ref_ptr, int ref_stride, \
                                   unsigned int max_sad) { \
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+  return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
 } \
-unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \
-                                      int  src_stride, \
-                                      const uint8_t *ref_ptr, \
-                                      int  ref_stride, \
+unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, int src_stride, \
+                                      const uint8_t *ref_ptr, int ref_stride, \
                                       const uint8_t *second_pred, \
                                       unsigned int max_sad) { \
   uint8_t comp_pred[m * n]; \
   comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
-  return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \
+  return sad(src_ptr, src_stride, comp_pred, m, m, n); \
 }
 
 sad_mxn_func(64, 64)
@@ -49,567 +62,263 @@ sad_mxn_func(8, 4)
 sad_mxn_func(4, 8)
 sad_mxn_func(4, 4)
 
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad64x32(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad64x32(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x64(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad32x64(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x16(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad32x16(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x32(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad16x32(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad64x64x3_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
+void vp9_sad64x64x3_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr, int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride,
-                              0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad32x32x3_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
+void vp9_sad32x32x3_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr, int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad64x64x8_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
+void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr, int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 3, ref_stride,
-                              0x7fffffff);
-  sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 4, ref_stride,
-                              0x7fffffff);
-  sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 5, ref_stride,
-                              0x7fffffff);
-  sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 6, ref_stride,
-                              0x7fffffff);
-  sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr + 7, ref_stride,
-                              0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad32x32x8_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
+void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr, int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 3, ref_stride,
-                              0x7fffffff);
-  sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 4, ref_stride,
-                              0x7fffffff);
-  sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 5, ref_stride,
-                              0x7fffffff);
-  sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 6, ref_stride,
-                              0x7fffffff);
-  sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr + 7, ref_stride,
-                              0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad16x16x3_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr, int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad16x16x8_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t *ref_ptr,
-                      int  ref_stride,
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr, int ref_stride,
                       uint32_t *sad_array) {
-  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr, ref_stride,
-                              0x7fffffff);
-  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 1, ref_stride,
-                              0x7fffffff);
-  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 2, ref_stride,
-                              0x7fffffff);
-  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 3, ref_stride,
-                              0x7fffffff);
-  sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 4, ref_stride,
-                              0x7fffffff);
-  sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 5, ref_stride,
-                              0x7fffffff);
-  sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 6, ref_stride,
-                              0x7fffffff);
-  sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr + 7, ref_stride,
-                              0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad16x8x3_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                               0x7fffffff);
 }
 
-void vp9_sad16x8x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
                      uint32_t *sad_array) {
-  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                               0x7fffffff);
 }
 
-void vp9_sad8x8x3_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
+void vp9_sad8x8x3_c(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride,
                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad8x8x8_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
+void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride,
                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr, ref_stride,
-                            0x7fffffff);
-  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride,
-                            0x7fffffff);
-  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride,
-                            0x7fffffff);
-  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 3, ref_stride,
-                            0x7fffffff);
-  sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 4, ref_stride,
-                            0x7fffffff);
-  sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 5, ref_stride,
-                            0x7fffffff);
-  sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 6, ref_stride,
-                            0x7fffffff);
-  sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr + 7, ref_stride,
-                            0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad8x16x3_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
+void vp9_sad8x16x3_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                               0x7fffffff);
 }
 
-void vp9_sad8x16x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
+void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t *ref_ptr, int ref_stride,
                      uint32_t *sad_array) {
-  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                               0x7fffffff);
 }
 
-void vp9_sad4x4x3_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
+void vp9_sad4x4x3_c(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride,
                     unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr, ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 3; ++i)
+    sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad4x4x8_c(const uint8_t *src_ptr,
-                    int  src_stride,
-                    const uint8_t *ref_ptr,
-                    int  ref_stride,
+void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride,
                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr, ref_stride,
-                            0x7fffffff);
-  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 1, ref_stride,
-                            0x7fffffff);
-  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 2, ref_stride,
-                            0x7fffffff);
-  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 3, ref_stride,
-                            0x7fffffff);
-  sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 4, ref_stride,
-                            0x7fffffff);
-  sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 5, ref_stride,
-                            0x7fffffff);
-  sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 6, ref_stride,
-                            0x7fffffff);
-  sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr + 7, ref_stride,
-                            0x7fffffff);
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr,
-                       int  src_stride,
-                       const uint8_t* const ref_ptr[],
-                       int  ref_stride,
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t* const ref_ptr[], int ref_stride,
                        unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
-                              ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                                0x7fffffff);
 }
 
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t* const ref_ptr[],
-                      int  ref_stride,
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t* const ref_ptr[], int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
-                             ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                               0x7fffffff);
 }
 
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t* const ref_ptr[], int ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
-                      int  src_stride,
-                      const uint8_t* const ref_ptr[],
-                      int  ref_stride,
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t* const ref_ptr[], int ref_stride,
                       unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
-                             ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                               0x7fffffff);
 }
 
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t* const ref_ptr[], int ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad8x4(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad8x4x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad8x4(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
+void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride,
+                    uint32_t *sad_array) {
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad8x4(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t* const ref_ptr[], int ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad4x8(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad4x8x8_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t *ref_ptr,
-                     int  ref_stride,
-                     uint32_t *sad_array) {
-  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr, ref_stride,
-                             0x7fffffff);
-  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 1, ref_stride,
-                             0x7fffffff);
-  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 2, ref_stride,
-                             0x7fffffff);
-  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 3, ref_stride,
-                             0x7fffffff);
-  sad_array[4] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 4, ref_stride,
-                             0x7fffffff);
-  sad_array[5] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 5, ref_stride,
-                             0x7fffffff);
-  sad_array[6] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 6, ref_stride,
-                             0x7fffffff);
-  sad_array[7] = vp9_sad4x8(src_ptr, src_stride,
-                             ref_ptr + 7, ref_stride,
-                             0x7fffffff);
+void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride,
+                    const uint8_t *ref_ptr, int ref_stride,
+                    uint32_t *sad_array) {
+  int i;
+  for (i = 0; i < 8; ++i)
+    sad_array[i] = vp9_sad4x8(src_ptr, src_stride, ref_ptr + i, ref_stride,
+                              0x7fffffff);
 }
 
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
-                     int  src_stride,
-                     const uint8_t* const ref_ptr[],
-                     int  ref_stride,
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride,
+                     const uint8_t* const ref_ptr[], int ref_stride,
                      unsigned int *sad_array) {
-  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[0], ref_stride, 0x7fffffff);
-  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[1], ref_stride, 0x7fffffff);
-  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[2], ref_stride, 0x7fffffff);
-  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
-                            ref_ptr[3], ref_stride, 0x7fffffff);
+  int i;
+  for (i = 0; i < 4; ++i)
+    sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr[i], ref_stride,
+                              0x7fffffff);
 }
diff --git a/vp9/encoder/vp9_sadmxn.h b/vp9/encoder/vp9_sadmxn.h
deleted file mode 100644
index 1bae4dd67..000000000
--- a/vp9/encoder/vp9_sadmxn.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_SADMXN_H_
-#define VP9_ENCODER_VP9_SADMXN_H_
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr,
-                                      int src_stride,
-                                      const uint8_t *ref_ptr,
-                                      int ref_stride,
-                                      int m,
-                                      int n) {
-  int r, c;
-  unsigned int sad = 0;
-
-  for (r = 0; r < n; r++) {
-    for (c = 0; c < m; c++) {
-      sad += abs(src_ptr[c] - ref_ptr[c]);
-    }
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sad;
-}
-
-#endif  // VP9_ENCODER_VP9_SADMXN_H_
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 876219268..c2eea0aaa 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -227,7 +227,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
 #if ALT_REF_MC_ENABLED
     // Source frames are extended to 16 pixels.  This is different than
-    //  L/A/G reference frames that have a border of 32 (VP9BORDERINPIXELS)
+    //  L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS)
     // A 6/8 tap filter is used for motion search.  This requires 2 pixels
     //  before and 3 pixels after.  So the largest Y mv on a border would
     //  then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 478b45ac0..b1c029cba 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,9 +74,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-ifeq ($(ARCH_X86_64),yes)
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
-endif
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 5c14b2e40..897ecd702 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -513,10 +513,8 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx) {
     priv->vp8_cfg = extracfg_map[i].cfg;
     priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
 
-    // TODO(agrange) Check the limits set on this buffer, or the check that is
-    // applied in vp9e_encode.
+    // Maximum buffer size approximated based on having multiple ARF.
     priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
-//    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
 
     if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
 
@@ -692,7 +690,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t  *ctx,
     }
   }
 
-  /* Initialize the encoder instance on the first frame*/
+  /* Initialize the encoder instance on the first frame. */
   if (!res && ctx->cpi) {
     unsigned int lib_flags;
     YV12_BUFFER_CONFIG sd;
@@ -704,9 +702,6 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t  *ctx,
     if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
       ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
 
-    // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
-    //    ((VP9_COMP *)ctx->cpi)->output_partition = 1;
-
     /* Convert API flags to internal codec lib flags */
     lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
 
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index ed0122c1b..58256b22b 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -29,7 +29,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
     img->fmt = VPX_IMG_FMT_I420;
   }
   img->w = yv12->y_stride;
-  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3);
+  img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
   img->d_w = yv12->y_crop_width;
   img->d_h = yv12->y_crop_height;
   img->x_chroma_shift = yv12->uv_width < yv12->y_width;
@@ -75,7 +75,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
 
   yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
 #if CONFIG_ALPHA
-  // For development purposes, force alpha to hold the same data a Y for now.
+  // For development purposes, force alpha to hold the same data as Y for now.
   yv12->alpha_buffer = yv12->y_buffer;
   yv12->alpha_width = yv12->y_width;
   yv12->alpha_height = yv12->y_height;
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 9ea0f549f..63003b9c2 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -44,7 +44,6 @@ VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
 VP9_CX_SRCS-yes += encoder/vp9_pickmode.h
-VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
 VP9_CX_SRCS-yes += encoder/vp9_variance.h
@@ -62,6 +61,8 @@ VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
 VP9_CX_SRCS-yes += encoder/vp9_subexp.c
 VP9_CX_SRCS-yes += encoder/vp9_subexp.h
+VP9_CX_SRCS-yes += encoder/vp9_resize.c
+VP9_CX_SRCS-yes += encoder/vp9_resize.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index bf5fc0779..610e7d280 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -18,10 +18,11 @@ extern "C" {
 #include "vpx/vpx_external_frame_buffer.h"
 #include "vpx/vpx_integer.h"
 
-#define VP8BORDERINPIXELS       32
-#define VP9INNERBORDERINPIXELS  96
-#define VP9BORDERINPIXELS      160
-#define VP9_INTERP_EXTEND        4
+#define VP8BORDERINPIXELS           32
+#define VP9INNERBORDERINPIXELS      96
+#define VP9_INTERP_EXTEND           4
+#define VP9_ENC_BORDER_IN_PIXELS    160
+#define VP9_DEC_BORDER_IN_PIXELS    32
 
   typedef struct yv12_buffer_config {
     int   y_width;
diff --git a/vpxdec.c b/vpxdec.c
index 420497914..fc344a162 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -33,6 +33,7 @@
 
 #include "./tools_common.h"
 #include "./webmdec.h"
+#include "./y4menc.h"
 
 static const char *exec_name;
 
@@ -131,6 +132,21 @@ static const arg_def_t *vp8_pp_args[] = {
 };
 #endif
 
+static int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+                           FilterMode mode) {  // Returns I420Scale()'s result.
+  assert(src->fmt == VPX_IMG_FMT_I420);  // Source must be I420.
+  assert(dst->fmt == VPX_IMG_FMT_I420);  // Destination must be I420.
+  return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
+                   src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U],
+                   src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V],
+                   src->d_w, src->d_h,  // Source dimensions.
+                   dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y],
+                   dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U],
+                   dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V],
+                   dst->d_w, dst->d_h,  // Destination dimensions.
+                   mode);  // Filter mode is forwarded as given.
+}
+
 void usage_exit() {
   int i;
 
@@ -229,47 +245,51 @@ static int read_frame(struct VpxDecInputContext *input, uint8_t **buf,
   }
 }
 
-void *out_open(const char *out_fn, int do_md5) {
-  void *out = NULL;
+static int get_image_plane_width(int plane, const vpx_image_t *img) {
+  // Plane 0, or a zero horizontal chroma shift: use d_w unchanged.
+  if (plane <= 0 || img->x_chroma_shift <= 0) return img->d_w;
+  return (img->d_w + 1) >> img->x_chroma_shift;  // +1 is added before shifting
+}
 
-  if (do_md5) {
-    MD5Context *md5_ctx = out = malloc(sizeof(MD5Context));
-    (void)out_fn;
-    MD5Init(md5_ctx);
-  } else {
-    FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb")
-                          : set_binary_mode(stdout);
+static int get_image_plane_height(int plane, const vpx_image_t *img) {
+  // Plane 0, or a zero vertical chroma shift: use d_h unchanged.
+  if (plane <= 0 || img->y_chroma_shift <= 0) return img->d_h;
+  return (img->d_h + 1) >> img->y_chroma_shift;  // +1 is added before shifting
+}
 
-    if (!outfile) {
-      fatal("Failed to output file");
-    }
-  }
+static void update_image_md5(const vpx_image_t *img, const int planes[3],
+                             MD5Context *md5) {
+  int i, y;
 
-  return out;
-}
+  for (i = 0; i < 3; ++i) {
+    const int plane = planes[i];
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = get_image_plane_width(plane, img);
+    const int h = get_image_plane_height(plane, img);
 
-void out_put(void *out, const uint8_t *buf, unsigned int len, int do_md5) {
-  if (do_md5) {
-    MD5Update(out, buf, len);
-  } else {
-    (void) fwrite(buf, 1, len, out);
+    for (y = 0; y < h; ++y) {
+      MD5Update(md5, buf, w);
+      buf += stride;
+    }
   }
 }
 
-void out_close(void *out, const char *out_fn, int do_md5) {
-  if (do_md5) {
-    uint8_t md5[16];
-    int i;
+static void write_image_file(const vpx_image_t *img, const int planes[3],
+                             FILE *file) {
+  int i, y;
 
-    MD5Final(md5, out);
-    free(out);
+  for (i = 0; i < 3; ++i) {
+    const int plane = planes[i];
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = get_image_plane_width(plane, img);
+    const int h = get_image_plane_height(plane, img);
 
-    for (i = 0; i < 16; i++)
-      printf("%02x", md5[i]);
-
-    printf("  %s\n", out_fn);
-  } else {
-    fclose(out);
+    for (y = 0; y < h; ++y) {
+      fwrite(buf, 1, w, file);
+      buf += stride;
+    }
   }
 }
 
@@ -413,6 +433,39 @@ void generate_filename(const char *pattern, char *out, size_t q_len,
   } while (*p);
 }
 
+// Returns 0 if outfile_pattern contains a '%' immediately followed by a
+// digit '1'..'9' (a sequence-number field), and 1 otherwise.
+static int is_single_file(const char *outfile_pattern) {
+  const char *cursor = strchr(outfile_pattern, '%');
+  while (cursor != NULL) {
+    const char next = cursor[1];
+    if (next >= '1' && next <= '9')
+      return 0;  // pattern contains sequence number, so it's not unique
+    cursor = strchr(cursor + 1, '%');
+  }
+
+  return 1;
+}
+
+// Prints the 16 digest bytes as two-digit hex, two spaces, then filename.
+static void print_md5(unsigned char digest[16], const char *filename) {
+  const unsigned char *byte = digest;
+  const unsigned char *const end = digest + 16;
+  while (byte != end) printf("%02x", *byte++);
+  printf("  %s\n", filename);
+}
+
+// "-" -> stdout (after set_binary_mode); else fopen(name, "wb") or fatal().
+static FILE *open_outfile(const char *name) {
+  FILE *stream = NULL;
+  if (strcmp("-", name) == 0) {
+    set_binary_mode(stdout);
+    return stdout;
+  }
+  stream = fopen(name, "wb");
+  if (stream == NULL) fatal("Failed to output file %s", name);
+  return stream;
+}
 
 int main_loop(int argc, const char **argv_) {
   vpx_codec_ctx_t       decoder;
@@ -430,11 +483,9 @@ int main_loop(int argc, const char **argv_) {
   unsigned long          dx_time = 0;
   struct arg               arg;
   char                   **argv, **argi, **argj;
-  const char             *outfile_pattern = 0;
-  char                    outfile[PATH_MAX];
+
   int                     single_file;
   int                     use_y4m = 1;
-  void                   *out = NULL;
   vpx_codec_dec_cfg_t     cfg = {0};
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
@@ -451,8 +502,13 @@ int main_loop(int argc, const char **argv_) {
   int                     num_external_frame_buffers = 0;
   int                     fb_lru_cache = 0;
   vpx_codec_frame_buffer_t *frame_buffers = NULL;
-  int                     display_width = 0;
-  int                     display_height = 0;
+
+  const char *outfile_pattern = NULL;
+  char outfile_name[PATH_MAX] = {0};
+  FILE *outfile = NULL;
+
+  MD5Context md5_ctx;
+  unsigned char md5_digest[16];
 
   struct VpxDecInputContext input = {0};
   struct VpxInputContext vpx_input_ctx = {0};
@@ -588,8 +644,7 @@ int main_loop(int argc, const char **argv_) {
   infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
 
   if (!infile) {
-    fprintf(stderr, "Failed to open file '%s'",
-            strcmp(fn, "-") ? fn : "stdin");
+    fprintf(stderr, "Failed to open file '%s'", strcmp(fn, "-") ? fn : "stdin");
     return EXIT_FAILURE;
   }
 #if CONFIG_OS_SUPPORT
@@ -613,58 +668,32 @@ int main_loop(int argc, const char **argv_) {
     return EXIT_FAILURE;
   }
 
-  /* If the output file is not set or doesn't have a sequence number in
-   * it, then we only open it once.
-   */
   outfile_pattern = outfile_pattern ? outfile_pattern : "-";
-  single_file = 1;
-  {
-    const char *p = outfile_pattern;
-    do {
-      p = strchr(p, '%');
-      if (p && p[1] >= '1' && p[1] <= '9') {
-        /* pattern contains sequence number, so it's not unique. */
-        single_file = 0;
-        break;
-      }
-      if (p)
-        p++;
-    } while (p);
-  }
+  single_file = is_single_file(outfile_pattern);
 
-  if (single_file && !noblit) {
-    generate_filename(outfile_pattern, outfile, sizeof(outfile) - 1,
+  if (!noblit && single_file) {
+    generate_filename(outfile_pattern, outfile_name, PATH_MAX,
                       vpx_input_ctx.width, vpx_input_ctx.height, 0);
-    out = out_open(outfile, do_md5);
+    if (do_md5)
+      MD5Init(&md5_ctx);
+    else
+      outfile = open_outfile(outfile_name);
   }
 
   if (use_y4m && !noblit) {
-    char buffer[128];
-
     if (!single_file) {
       fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
               " try --i420 or --yv12.\n");
       return EXIT_FAILURE;
     }
 
-    if (vpx_input_ctx.file_type == FILE_TYPE_WEBM)
+    if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) {
       if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) {
         fprintf(stderr, "Failed to guess framerate -- error parsing "
                 "webm file?\n");
         return EXIT_FAILURE;
       }
-
-
-    /*Note: We can't output an aspect ratio here because IVF doesn't
-       store one, and neither does VP8.
-      That will have to wait until these tools support WebM natively.*/
-    snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ",
-             vpx_input_ctx.width, vpx_input_ctx.height,
-             vpx_input_ctx.framerate.numerator,
-             vpx_input_ctx.framerate.denominator,
-             'p');
-    out_put(out, (unsigned char *)buffer,
-            (unsigned int)strlen(buffer), do_md5);
+    }
   }
 
   /* Try to determine the codec from the fourcc. */
@@ -811,25 +840,20 @@ int main_loop(int argc, const char **argv_) {
       show_progress(frame_in, frame_out, dx_time);
 
     if (!noblit) {
-      if (frame_out == 1 && img && use_y4m) {
-        /* Write out the color format to terminate the header line */
-        const char *color =
-            img->fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
-            img->fmt == VPX_IMG_FMT_I444 ? "C444\n" :
-            img->fmt == VPX_IMG_FMT_I422 ? "C422\n" :
-            "C420jpeg\n";
-
-        out_put(out, (const unsigned char*)color, strlen(color), do_md5);
-      }
+      if (frame_out == 1 && img && single_file && !do_md5 && use_y4m)
+        y4m_write_file_header(outfile,
+                              vpx_input_ctx.width, vpx_input_ctx.height,
+                              &vpx_input_ctx.framerate, img->fmt);
 
       if (img && do_scale) {
         if (frame_out == 1) {
           // If the output frames are to be scaled to a fixed display size then
           // use the width and height specified in the container. If either of
           // these is set to 0, use the display size set in the first frame
-          // header.
-          display_width = vpx_input_ctx.width;
-          display_height = vpx_input_ctx.height;
+          // header. If that is unavailable, use the raw decoded size of the
+          // first decoded frame.
+          int display_width = vpx_input_ctx.width;
+          int display_height = vpx_input_ctx.height;
           if (!display_width || !display_height) {
             int display_size[2];
             if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE,
@@ -846,67 +870,40 @@ int main_loop(int argc, const char **argv_) {
                                      display_height, 16);
         }
 
-        if (img->d_w != display_width || img->d_h != display_height) {
-          assert(img->fmt == VPX_IMG_FMT_I420);
-          I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
-                    img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
-                    img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
-                    img->d_w, img->d_h,
-                    scaled_img->planes[VPX_PLANE_Y],
-                    scaled_img->stride[VPX_PLANE_Y],
-                    scaled_img->planes[VPX_PLANE_U],
-                    scaled_img->stride[VPX_PLANE_U],
-                    scaled_img->planes[VPX_PLANE_V],
-                    scaled_img->stride[VPX_PLANE_V],
-                    display_width, display_height,
-                    kFilterBox);
+        if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+          vpx_image_scale(img, scaled_img, kFilterBox);
           img = scaled_img;
         }
       }
+
       if (img) {
-        unsigned int y;
-        char out_fn[PATH_MAX];
-        uint8_t *buf;
-        unsigned int c_w =
-            img->x_chroma_shift ? (1 + img->d_w) >> img->x_chroma_shift
-                                : img->d_w;
-        unsigned int c_h =
-            img->y_chroma_shift ? (1 + img->d_h) >> img->y_chroma_shift
-                                : img->d_h;
+        const int PLANES_YUV[] = {VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V};
+        const int PLANES_YVU[] = {VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U};
 
-        if (!single_file) {
-          size_t len = sizeof(out_fn) - 1;
+        const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
 
-          out_fn[len] = '\0';
-          generate_filename(outfile_pattern, out_fn, len - 1,
+        if (!single_file) {
+          generate_filename(outfile_pattern, outfile_name, PATH_MAX,
                             img->d_w, img->d_h, frame_in);
-          out = out_open(out_fn, do_md5);
-        } else if (use_y4m)
-          out_put(out, (unsigned char *)"FRAME\n", 6, do_md5);
-
-        buf = img->planes[VPX_PLANE_Y];
-
-        for (y = 0; y < img->d_h; y++) {
-          out_put(out, buf, img->d_w, do_md5);
-          buf += img->stride[VPX_PLANE_Y];
-        }
-
-        buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U];
-
-        for (y = 0; y < c_h; y++) {
-          out_put(out, buf, c_w, do_md5);
-          buf += img->stride[VPX_PLANE_U];
-        }
-
-        buf = img->planes[flipuv ? VPX_PLANE_U : VPX_PLANE_V];
-
-        for (y = 0; y < c_h; y++) {
-          out_put(out, buf, c_w, do_md5);
-          buf += img->stride[VPX_PLANE_V];
+          if (do_md5) {
+            MD5Init(&md5_ctx);
+            update_image_md5(img, planes, &md5_ctx);
+            MD5Final(md5_digest, &md5_ctx);
+            print_md5(md5_digest, outfile_name);
+          } else {
+            outfile = open_outfile(outfile_name);
+            write_image_file(img, planes, outfile);
+            fclose(outfile);
+          }
+        } else {
+          if (do_md5) {
+            update_image_md5(img, planes, &md5_ctx);
+          } else {
+            if (use_y4m)
+              y4m_write_frame_header(outfile);
+            write_image_file(img, planes, outfile);
+          }
         }
-
-        if (!single_file)
-          out_close(out, out_fn, do_md5);
       }
     }
 
@@ -930,8 +927,14 @@ fail:
     return EXIT_FAILURE;
   }
 
-  if (single_file && !noblit)
-    out_close(out, outfile, do_md5);
+  if (!noblit && single_file) {
+    if (do_md5) {
+      MD5Final(md5_digest, &md5_ctx);
+      print_md5(md5_digest, outfile_name);
+    } else {
+      fclose(outfile);
+    }
+  }
 
   if (input.vpx_input_ctx->file_type == FILE_TYPE_WEBM)
     webm_free(input.webm_ctx);
diff --git a/vpxenc.c b/vpxenc.c
index 396e43dc9..f19300acf 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -833,8 +833,8 @@ static int compare_img(vpx_image_t *img1, vpx_image_t *img2)
   unsigned int i;
 
   match &= (img1->fmt == img2->fmt);
-  match &= (img1->w == img2->w);
-  match &= (img1->h == img2->h);
+  match &= (img1->d_w == img2->d_w);
+  match &= (img1->d_h == img2->d_h);
 
   for (i = 0; i < img1->d_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_Y]+i*img1->stride[VPX_PLANE_Y],
diff --git a/y4menc.c b/y4menc.c
new file mode 100644
index 000000000..8321b432e
--- /dev/null
+++ b/y4menc.c
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./y4menc.h"
+
+// Writes the YUV4MPEG2 stream header line to file: W/H from width/height,
+// F from framerate, the I field fixed to 'p', and a C tag picked from fmt.
+void y4m_write_file_header(FILE *file, int width, int height,
+                           const struct VpxRational *framerate,
+                           vpx_img_fmt_t fmt) {
+  const char *color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
+                      fmt == VPX_IMG_FMT_I444 ? "C444\n" :
+                      fmt == VPX_IMG_FMT_I422 ? "C422\n" :
+                      "C420jpeg\n";
+  // Note: We can't output an aspect ratio here because IVF doesn't store
+  // one, and neither does VP8; that needs native WebM support in the tools.
+  fprintf(file, "YUV4MPEG2 W%d H%d F%u:%u I%c %s", width, height,
+          framerate->numerator, framerate->denominator, 'p', color);
+}
+
+void y4m_write_frame_header(FILE *file) {
+  fputs("FRAME\n", file);  // literal frame marker line, no formatting needed
+}
diff --git a/y4menc.h b/y4menc.h
new file mode 100644
index 000000000..e5f7978a7
--- /dev/null
+++ b/y4menc.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef Y4MENC_H_
+#define Y4MENC_H_
+
+#include <stdio.h>
+
+#include "./tools_common.h"
+
+#include "vpx/vpx_decoder.h"
+
+void y4m_write_file_header(FILE *file, int width, int height,
+                           const struct VpxRational *framerate,
+                           vpx_img_fmt_t fmt);
+
+void y4m_write_frame_header(FILE *file);
+
+
+#endif  // Y4MENC_H_