135 files changed, 1917 insertions, 1822 deletions
diff --git a/build/make/ads2armasm_ms.pl b/build/make/ads2armasm_ms.pl
index 1def53901..95c808403 100755
--- a/build/make/ads2armasm_ms.pl
+++ b/build/make/ads2armasm_ms.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 ##
 ##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 ##
diff --git a/build/make/configure.sh b/build/make/configure.sh
index b5151da13..c379c74a9 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1199,8 +1199,8 @@ EOF
     fi
 
     # default use_x86inc to yes if pic is no or 64bit or we are not on darwin
-    echo "  checking here for x86inc \"${tgt_isa}\" \"$pic\" "
-    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}"  ]; then
+    if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o \
+         "${tgt_os#darwin}" = "${tgt_os}"  ]; then
       soft_enable use_x86inc
     fi
 
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 7df0334c6..5936370a7 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -371,7 +371,7 @@ generate_vcproj() {
                     vpx)
                         tag Tool \
                             Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
 
                         tag Tool \
                             Name="VCCLCompilerTool" \
@@ -412,7 +412,6 @@ generate_vcproj() {
                             obj_int_extract)
                                 tag Tool \
                                     Name="VCLinkerTool" \
-                                    OutputFile="${name}.exe" \
                                     GenerateDebugInformation="true" \
                             ;;
                             *)
@@ -479,7 +478,7 @@ generate_vcproj() {
                     vpx)
                         tag Tool \
                             Name="VCPreBuildEventTool" \
-                            CommandLine="call obj_int_extract.bat $src_path_bare" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
 
                         tag Tool \
                             Name="VCCLCompilerTool" \
@@ -522,7 +521,6 @@ generate_vcproj() {
                             obj_int_extract)
                                 tag Tool \
                                     Name="VCLinkerTool" \
-                                    OutputFile="${name}.exe" \
                                     GenerateDebugInformation="true" \
                             ;;
                             *)
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index ba9e83a7c..4558aa1c1 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -156,6 +156,10 @@ generate_filter() {
                 objf=$(echo ${f%.*}.obj | sed -e 's/^[\./]\+//g' -e 's,/,_,g')
 
                 if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+                    # Avoid object file name collisions, i.e. vpx_config.c and
+                    # vpx_config.asm produce the same object file without
+                    # this additional suffix.
+                    objf=${objf%.obj}_asm.obj
                     open_tag CustomBuild \
                         Include=".\\$f"
                     for plat in "${platforms[@]}"; do
@@ -430,6 +434,14 @@ generate_vcxproj() {
                 Condition="'\$(Configuration)|\$(Platform)'=='$config|$plat'"
             tag_content OutDir "\$(SolutionDir)$plat_no_ws\\\$(Configuration)\\"
             tag_content IntDir "$plat_no_ws\\\$(Configuration)\\${name}\\"
+            if [ "$proj_kind" == "lib" ]; then
+              if [ "$config" == "Debug" ]; then
+                config_suffix=d
+              else
+                config_suffix=""
+              fi
+              tag_content TargetName "${name}${lib_sfx}${config_suffix}"
+            fi
             close_tag PropertyGroup
         done
     done
@@ -452,7 +464,6 @@ generate_vcxproj() {
                 opt=Disabled
                 runtime=$debug_runtime
                 curlibs=$debug_libs
-                confsuffix=d
                 case "$name" in
                 obj_int_extract)
                     debug=DEBUG
@@ -465,7 +476,6 @@ generate_vcxproj() {
                 opt=MaxSpeed
                 runtime=$release_runtime
                 curlibs=$libs
-                confsuffix=""
                 tag_content FavorSizeOrSpeed Speed
                 debug=NDEBUG
             fi
@@ -501,9 +511,6 @@ generate_vcxproj() {
                 close_tag Link
                 ;;
             lib)
-                open_tag Lib
-                tag_content OutputFile "\$(OutDir)${name}${lib_sfx}${confsuffix}.lib"
-                close_tag Lib
                 ;;
             esac
             close_tag ItemDefinitionGroup
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index ed037132a..93c9adcac 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -333,6 +333,7 @@ EOF
 #
 # Main Driver
 #
+ALL_FUNCS=$(export LC_ALL=C; echo $ALL_FUNCS | tr ' ' '\n' | sort |tr '\n' ' ')
 require c
 case $arch in
   x86)
diff --git a/build/make/thumb.pm b/build/make/thumb.pm
index d8d04aa85..9604c8e32 100644
--- a/build/make/thumb.pm
+++ b/build/make/thumb.pm
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 ##
 ##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 ##
diff --git a/examples.mk b/examples.mk
index 13dbcb6f1..85b84577c 100644
--- a/examples.mk
+++ b/examples.mk
@@ -19,6 +19,8 @@ LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
 # while EXAMPLES demonstrate specific portions of the API.
 UTILS-$(CONFIG_DECODERS)    += vpxdec.c
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
+vpxdec.SRCS                 += vpx_ports/mem_ops.h
+vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
@@ -26,13 +28,13 @@ vpxdec.SRCS                 += ivfdec.c ivfdec.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += webmdec.c webmdec.h
 vpxdec.SRCS                 += y4menc.c y4menc.h
-vpxdec.SRCS                 += nestegg/halloc/halloc.h
-vpxdec.SRCS                 += nestegg/halloc/src/align.h
-vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
-vpxdec.SRCS                 += nestegg/halloc/src/hlist.h
-vpxdec.SRCS                 += nestegg/halloc/src/macros.h
-vpxdec.SRCS                 += nestegg/include/nestegg/nestegg.h
-vpxdec.SRCS                 += nestegg/src/nestegg.c
+vpxdec.SRCS                 += third_party/nestegg/halloc/halloc.h
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/align.h
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/halloc.c
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/hlist.h
+vpxdec.SRCS                 += third_party/nestegg/halloc/src/macros.h
+vpxdec.SRCS                 += third_party/nestegg/include/nestegg/nestegg.h
+vpxdec.SRCS                 += third_party/nestegg/src/nestegg.c
 vpxdec.SRCS                 += $(LIBYUV_SRCS)
 vpxdec.GUID                  = BA5FE66F-38DD-E034-F542-B1578C5FB950
 vpxdec.DESCRIPTION           = Full featured decoder
@@ -85,12 +87,16 @@ simple_decoder.SRCS                += ivfdec.h ivfdec.c
 simple_decoder.SRCS                += tools_common.h tools_common.c
 simple_decoder.SRCS                += video_common.h
 simple_decoder.SRCS                += video_reader.h video_reader.c
+simple_decoder.SRCS                += vpx_ports/mem_ops.h
+simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
 EXAMPLES-$(CONFIG_VP8_DECODER)     += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
 postproc.SRCS                      += tools_common.h tools_common.c
 postproc.SRCS                      += video_common.h
 postproc.SRCS                      += video_reader.h video_reader.c
+postproc.SRCS                      += vpx_ports/mem_ops.h
+postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
 EXAMPLES-$(CONFIG_VP8_DECODER)     += decode_to_md5.c
@@ -99,6 +105,8 @@ decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
 decode_to_md5.SRCS                 += tools_common.h tools_common.c
 decode_to_md5.SRCS                 += video_common.h
 decode_to_md5.SRCS                 += video_reader.h video_reader.c
+decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
+decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
 EXAMPLES-$(CONFIG_VP8_ENCODER)  += simple_encoder.c
@@ -124,6 +132,8 @@ decode_with_drops.SRCS          += ivfdec.h ivfdec.c
 decode_with_drops.SRCS          += tools_common.h tools_common.c
 decode_with_drops.SRCS          += video_common.h
 decode_with_drops.SRCS          += video_reader.h video_reader.c
+decode_with_drops.SRCS          += vpx_ports/mem_ops.h
+decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
 endif
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 4c29056e5..1fef7dbb8 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -18,11 +18,12 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <string.h>
-#include "math.h"
+#include <math.h>
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
 #include "vpx_ports/mem_ops.h"
+#include "./tools_common.h"
 #define interface (vpx_codec_vp8_cx())
 #define fourcc    0x30385056
 
@@ -44,21 +45,6 @@
 #include "third_party/libyuv/include/libyuv/scale.h"
 #include "third_party/libyuv/include/libyuv/cpu_id.h"
 
-static double vp8_mse2psnr(double Samples, double Peak, double Mse)
-{
-    double psnr;
-
-    if ((double)Mse > 0.0)
-        psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-    else
-        psnr = 60;      // Limit to prevent / 0
-
-    if (psnr > 60)
-        psnr = 60;
-
-    return psnr;
-}
-
 static void die(const char *fmt, ...) {
     va_list ap;
 
@@ -454,8 +440,8 @@ int main(int argc, char **argv)
         if ( (show_psnr) && (psnr_count[i]>0) )
         {
             int j;
-            double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0,
-                                         psnr_sse_total[i]);
+            double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+                                        psnr_sse_total[i]);
 
             fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
 
diff --git a/examples/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_scalable_encoder.c
index 9f526b0a4..98dc3f5f9 100644
--- a/examples/vp9_spatial_scalable_encoder.c
+++ b/examples/vp9_spatial_scalable_encoder.c
@@ -54,12 +54,18 @@ static const arg_def_t kf_dist_arg =
 static const arg_def_t scale_factors_arg =
     ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
 static const arg_def_t quantizers_arg =
-    ARG_DEF("q", "quantizers", 1, "quantizers (lowest to highest layer)");
+    ARG_DEF("q", "quantizers", 1, "quantizers for non key frames, also will "
+            "be applied to key frames if -qn is not specified (lowest to "
+            "highest layer)");
+static const arg_def_t quantizers_keyframe_arg =
+    ARG_DEF("qn", "quantizers-keyframe", 1, "quantizers for key frames (lowest "
+        "to highest layer)");
 
 static const arg_def_t *svc_args[] = {
   &encoding_mode_arg, &frames_arg,        &width_arg,       &height_arg,
   &timebase_arg,      &bitrate_arg,       &skip_frames_arg, &layers_arg,
-  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,  NULL
+  &kf_dist_arg,       &scale_factors_arg, &quantizers_arg,
+  &quantizers_keyframe_arg, NULL
 };
 
 static const SVC_ENCODING_MODE default_encoding_mode =
@@ -150,7 +156,9 @@ static void parse_command_line(int argc, const char **argv_,
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
       vpx_svc_set_scale_factors(svc_ctx, arg.val);
     } else if (arg_match(&arg, &quantizers_arg, argi)) {
-      vpx_svc_set_quantizers(svc_ctx, arg.val);
+      vpx_svc_set_quantizers(svc_ctx, arg.val, 0);
+    } else if (arg_match(&arg, &quantizers_keyframe_arg, argi)) {
+      vpx_svc_set_quantizers(svc_ctx, arg.val, 1);
     } else {
       ++argj;
     }
diff --git a/examples/vpx_temporal_scalable_patterns.c b/examples/vpx_temporal_scalable_patterns.c
index 32e88e33f..6ec1b6208 100644
--- a/examples/vpx_temporal_scalable_patterns.c
+++ b/examples/vpx_temporal_scalable_patterns.c
@@ -52,6 +52,12 @@ struct RateControlMetrics {
   double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
 };
 
+// Note: these rate control metrics assume only 1 key frame in the
+// sequence (i.e., first frame only). So for temporal pattern# 7
+// (which has key frame for every frame on base layer), the metrics
+// computation will be off/wrong.
+// TODO(marpan): Update these metrics to account for multiple key frames
+// in the stream.
 static void set_rate_control_metrics(struct RateControlMetrics *rc,
                                      vpx_codec_enc_cfg_t *cfg) {
   unsigned int i = 0;
@@ -81,7 +87,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
                                           vpx_codec_enc_cfg_t *cfg,
                                           int frame_cnt) {
   unsigned int i = 0;
-  int check_num_frames = 0;
+  int tot_num_frames = 0;
   printf("Total number of processed frames: %d\n\n", frame_cnt -1);
   printf("Rate control layer stats for %d layer(s):\n\n",
       cfg->ts_number_layers);
@@ -89,8 +95,9 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
     const int num_dropped = (i > 0) ?
         (rc->layer_input_frames[i] - rc->layer_enc_frames[i]) :
         (rc->layer_input_frames[i] - rc->layer_enc_frames[i] - 1);
+    tot_num_frames += rc->layer_input_frames[i];
     rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[i] *
-        rc->layer_encoding_bitrate[i] / rc->layer_tot_enc_frames[i];
+        rc->layer_encoding_bitrate[i] / tot_num_frames;
     rc->layer_avg_frame_size[i] = rc->layer_avg_frame_size[i] /
         rc->layer_enc_frames[i];
     rc->layer_avg_rate_mismatch[i] = 100.0 * rc->layer_avg_rate_mismatch[i] /
@@ -105,10 +112,9 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
         "and perc dropped frames: %d %d %f \n", rc->layer_input_frames[i],
         rc->layer_enc_frames[i],
         100.0 * num_dropped / rc->layer_input_frames[i]);
-    check_num_frames += rc->layer_input_frames[i];
     printf("\n");
   }
-  if ((frame_cnt - 1) != check_num_frames)
+  if ((frame_cnt - 1) != tot_num_frames)
     die("Error: Number of input frames not equal to output! \n");
 }
 
@@ -565,6 +571,9 @@ int main(int argc, char **argv) {
   }
   vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
   vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
+  // This controls the maximum target size of the key frame.
+  // For generating smaller key frames, use a smaller max_intra_size_pct
+  // value, like 100 or 200.
   max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
       * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
   vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
diff --git a/ivfdec.c b/ivfdec.c
index 40394a81a..6dcd66f73 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -12,6 +12,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "vpx_ports/mem_ops.h"
+
 #include "./ivfdec.h"
 
 static const char *IVF_SIGNATURE = "DKIF";
diff --git a/libs.mk b/libs.mk
index eac61f289..302d2afbd 100644
--- a/libs.mk
+++ b/libs.mk
@@ -164,6 +164,8 @@ endif
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops.h
+CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops_aligned.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_once.h
 CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
diff --git a/test/cq_test.cc b/test/cq_test.cc
index a2c829163..7da7b80aa 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -20,7 +20,7 @@ namespace {
 const int kCQLevelMin = 4;
 const int kCQLevelMax = 63;
 const int kCQLevelStep = 8;
-const int kCQTargetBitrate = 2000;
+const unsigned int kCQTargetBitrate = 2000;
 
 class CQTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<int> {
@@ -66,17 +66,17 @@ class CQTest : public ::libvpx_test::EncoderTest,
     return pow(10.0, avg_psnr / 10.0) / file_size_;
   }
 
-  int file_size() const { return file_size_; }
+  size_t file_size() const { return file_size_; }
   int n_frames() const { return n_frames_; }
 
  private:
   int cq_level_;
-  int file_size_;
+  size_t file_size_;
   double psnr_;
   int n_frames_;
 };
 
-int prev_actual_bitrate = kCQTargetBitrate;
+unsigned int prev_actual_bitrate = kCQTargetBitrate;
 TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
@@ -88,7 +88,8 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
                                      timebase.den, timebase.num, 0, 30);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   const double cq_psnr_lin = GetLinearPSNROverBitrate();
-  const int cq_actual_bitrate = file_size() * 8 * 30 / (n_frames() * 1000);
+  const unsigned int cq_actual_bitrate =
+      static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000);
   EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate);
   EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate);
   prev_actual_bitrate = cq_actual_bitrate;
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 8d115fad3..cb5562ec1 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -512,6 +512,14 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct16x16_c,
+                   &vp9_idct16x16_256_add_neon, 0)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16DCT,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index a2608acb5..013f4513e 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -248,6 +248,16 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
         make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_neon, 0),
+        make_tuple(&vp9_fdct32x32_rd_c,
+                   &vp9_idct32x32_1024_add_neon, 1)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 30c20e91f..4cd9efb86 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -16,8 +16,8 @@
 
 namespace {
 
-const int kMaxErrorFrames = 8;
-const int kMaxDroppableFrames = 8;
+const int kMaxErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
 
 class ErrorResilienceTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
@@ -175,6 +175,10 @@ TEST_P(ErrorResilienceTest, OnVersusOff) {
   }
 }
 
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
+// Check both isolated and consecutive loss.
 TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
@@ -186,14 +190,18 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
   init_flags_ = VPX_CODEC_USE_PSNR;
 
   libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                     timebase.den, timebase.num, 0, 30);
+                                     timebase.den, timebase.num, 0, 40);
 
   // Error resilient mode ON.
   cfg_.g_error_resilient = 1;
-
-  // Set an arbitrary set of error frames same as droppable frames
-  unsigned int num_droppable_frames = 2;
-  unsigned int droppable_frame_list[] = {5, 16};
+  cfg_.kf_mode = VPX_KF_DISABLED;
+
+  // Set an arbitrary set of error frames same as droppable frames.
+  // In addition to isolated loss/drop, add a long consecutive series
+  // (of size 9) of dropped frames.
+  unsigned int num_droppable_frames = 11;
+  unsigned int droppable_frame_list[] = {5, 16, 22, 23, 24, 25, 26, 27, 28,
+                                         29, 30};
   SetDroppableFrames(num_droppable_frames, droppable_frame_list);
   SetErrorFrames(num_droppable_frames, droppable_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
@@ -202,7 +210,7 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
             << GetMismatchFrames() << "\n";
   EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
 
-  // reset previously set error/droppable frames
+  // Reset previously set of error/droppable frames.
   Reset();
 
 #if 0
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index dc6668759..127775c9a 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -286,6 +286,21 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_c,
+                   &vp9_idct4x4_16_add_neon, 0)));
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_NEON, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 98aabe6a2..6f2d7d1c9 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -313,6 +313,20 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0)));
+INSTANTIATE_TEST_CASE_P(
+    DISABLED_NEON, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 2a32410f3..8849ce626 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -140,6 +140,30 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_idct4x4_16_add_c,
                    &vp9_idct4x4_1_add_c,
                    TX_4X4, 1)));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialIDctTest,
+    ::testing::Values(
+        make_tuple(&vp9_idct32x32_1024_add_c,
+                   &vp9_idct32x32_1_add_neon,
+                   TX_32X32, 1),
+        make_tuple(&vp9_idct16x16_256_add_c,
+                   &vp9_idct16x16_10_add_neon,
+                   TX_16X16, 10),
+        make_tuple(&vp9_idct16x16_256_add_c,
+                   &vp9_idct16x16_1_add_neon,
+                   TX_16X16, 1),
+        make_tuple(&vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_10_add_neon,
+                   TX_8X8, 10),
+        make_tuple(&vp9_idct8x8_64_add_c,
+                   &vp9_idct8x8_1_add_neon,
+                   TX_8X8, 1),
+        make_tuple(&vp9_idct4x4_16_add_c,
+                   &vp9_idct4x4_1_add_neon,
+                   TX_4X4, 1)));
+#endif
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, PartialIDctTest,
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 4a91b0b60..a692891ad 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -296,6 +296,8 @@ TEST_P(SADTest, MaxSAD) {
 
 using std::tr1::make_tuple;
 
+//------------------------------------------------------------------------------
+// C functions
 #if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;
 const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;
@@ -364,16 +366,20 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
                         make_tuple(8, 4, sad_8x4x4d_c),
                         make_tuple(4, 8, sad_4x8x4d_c),
                         make_tuple(4, 4, sad_4x4x4d_c)));
-#endif
+#endif  // CONFIG_VP9_ENCODER
 
-// ARM tests
+//------------------------------------------------------------------------------
+// ARM functions
 #if HAVE_MEDIA
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;
 INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_armv6)));
-
 #endif
+#endif
+
 #if HAVE_NEON
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon;
 const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon;
 const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon;
@@ -386,8 +392,10 @@ INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
                         make_tuple(8, 8, sad_8x8_neon),
                         make_tuple(4, 4, sad_4x4_neon)));
 #endif
+#endif
 
-// X86 tests
+//------------------------------------------------------------------------------
+// x86 functions
 #if HAVE_MMX
 #if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx;
@@ -437,9 +445,9 @@ const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
 INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
                         make_tuple(4, 8, sad_4x8x4d_sse),
                         make_tuple(4, 4, sad_4x4x4d_sse)));
-#endif
-#endif
-#endif
+#endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_ENCODER
+#endif  // HAVE_SSE
 
 #if HAVE_SSE2
 #if CONFIG_VP8_ENCODER
@@ -537,10 +545,12 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
 
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_sse3)));
 #endif
 #endif
+#endif
 
 }  // namespace
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
index 3434662fb..1b2f03f53 100644
--- a/test/sixtap_predict_test.cc
+++ b/test/sixtap_predict_test.cc
@@ -193,6 +193,16 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(8, 8, sixtap_8x8_c),
         make_tuple(8, 4, sixtap_8x4_c),
         make_tuple(4, 4, sixtap_4x4_c)));
+#if HAVE_NEON
+const sixtap_predict_fn_t sixtap_16x16_neon = vp8_sixtap_predict16x16_neon;
+const sixtap_predict_fn_t sixtap_8x8_neon = vp8_sixtap_predict8x8_neon;
+const sixtap_predict_fn_t sixtap_8x4_neon = vp8_sixtap_predict8x4_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_neon),
+        make_tuple(8, 8, sixtap_8x8_neon),
+        make_tuple(8, 4, sixtap_8x4_neon)));
+#endif
 #if HAVE_MMX
 const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
 const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 9e242a246..3efb9553c 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -105,6 +105,11 @@ TEST_P(SubtractBlockTest, SimpleSubtract) {
 INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_c));
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
+                        ::testing::Values(vp8_subtract_b_neon));
+#endif
+
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
                         ::testing::Values(vp8_subtract_b_mmx));
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 75659d50d..2e5653424 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -177,20 +177,48 @@ TEST_F(SvcTest, SetQuantizersOption) {
   codec_initialized_ = true;
 }
 
+TEST_F(SvcTest, SetKeyFrameQuantizersOption) {
+  svc_.spatial_layers = 2;
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_,
+                                       "quantizers-keyframe=not-quantizers");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  vpx_svc_set_options(&svc_, "quantizers-keyframe=40,45");
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
+}
+
 TEST_F(SvcTest, SetQuantizers) {
-  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30", 0);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  res = vpx_svc_set_quantizers(&svc_, NULL);
+  res = vpx_svc_set_quantizers(&svc_, NULL, 0);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40");
+  res = vpx_svc_set_quantizers(&svc_, "40", 0);
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  res = vpx_svc_set_quantizers(&svc_, "40,30");
+  res = vpx_svc_set_quantizers(&svc_, "40,30", 0);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, SetKeyFrameQuantizers) {
+  vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,31", 1);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, NULL, 1);
+  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, "40,30", 1);
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
@@ -221,7 +249,7 @@ TEST_F(SvcTest, SetScaleFactors) {
 TEST_F(SvcTest, FirstFrameHasLayers) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_quantizers(&svc_, "40,30", 0);
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -248,7 +276,7 @@ TEST_F(SvcTest, FirstFrameHasLayers) {
 TEST_F(SvcTest, EncodeThreeFrames) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_quantizers(&svc_, "40,30", 0);
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
@@ -301,7 +329,7 @@ TEST_F(SvcTest, EncodeThreeFrames) {
 TEST_F(SvcTest, GetLayerResolution) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
-  vpx_svc_set_quantizers(&svc_, "40,30");
+  vpx_svc_set_quantizers(&svc_, "40,30", 0);
 
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
diff --git a/test/test.mk b/test/test.mk
index cabfc678d..bf6d055ff 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,54 +1,56 @@
+LIBVPX_TEST_SRCS-yes += acm_random.h
 LIBVPX_TEST_SRCS-yes += clear_system_state.h
+LIBVPX_TEST_SRCS-yes += codec_factory.h
+LIBVPX_TEST_SRCS-yes += md5_helper.h
 LIBVPX_TEST_SRCS-yes += register_state_check.h
 LIBVPX_TEST_SRCS-yes += test.mk
-LIBVPX_TEST_SRCS-yes += acm_random.h
-LIBVPX_TEST_SRCS-yes += md5_helper.h
-LIBVPX_TEST_SRCS-yes += codec_factory.h
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
+LIBVPX_TEST_SRCS-yes += test_vectors.cc
+LIBVPX_TEST_SRCS-yes += test_vectors.h
 LIBVPX_TEST_SRCS-yes += util.h
 LIBVPX_TEST_SRCS-yes += video_source.h
-LIBVPX_TEST_SRCS-yes += test_vectors.h
-LIBVPX_TEST_SRCS-yes += test_vectors.cc
 
 ##
 ## BLACK BOX TESTS
 ##
 ## Black box tests only use the public API.
 ##
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc
-
-LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
-LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += datarate_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += y4m_video_source.h
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += ../y4minput.h ../y4minput.c
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
 
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc
 LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h
-LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc
+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h
 
 ## WebM Parsing
-NESTEGG_SRCS                           += ../nestegg/halloc/halloc.h
-NESTEGG_SRCS                           += ../nestegg/halloc/src/align.h
-NESTEGG_SRCS                           += ../nestegg/halloc/src/halloc.c
-NESTEGG_SRCS                           += ../nestegg/halloc/src/hlist.h
-NESTEGG_SRCS                           += ../nestegg/include/nestegg/nestegg.h
-NESTEGG_SRCS                           += ../nestegg/src/nestegg.c
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/halloc.h
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/align.h
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/halloc.c
+NESTEGG_SRCS                           += ../third_party/nestegg/halloc/src/hlist.h
+NESTEGG_SRCS                           += ../third_party/nestegg/include/nestegg/nestegg.h
+NESTEGG_SRCS                           += ../third_party/nestegg/src/nestegg.c
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += $(NESTEGG_SRCS)
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
+
 # Currently we only support decoder perf tests for vp9
 ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER), yesyes)
 LIBVPX_TEST_SRCS-yes                   += decode_perf_test.cc
@@ -70,18 +72,17 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc
 endif
 
-LIBVPX_TEST_SRCS-yes                   += idct_test.cc
-LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
-LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 
+LIBVPX_TEST_SRCS-yes                   += idct_test.cc
+LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
+LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
+
 endif # VP8
 
 ## VP9
@@ -89,29 +90,30 @@ ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
 
 # These tests require both the encoder and decoder to be built.
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
-LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
-
 # IDCT test currently depends on FDCT function
 LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc
 LIBVPX_TEST_SRCS-yes                   += partial_idct_test.cc
 LIBVPX_TEST_SRCS-yes                   += superframe_test.cc
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
+
 endif
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
-
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
 endif # VP9
 
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc
 
-endif
+endif # CONFIG_SHARED
 
 
 ##
@@ -122,128 +124,128 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-004.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-005.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-006.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-007.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-008.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-009.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-010.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-011.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-012.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-013.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-014.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-015.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-016.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-017.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-018.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1400.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1411.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1416.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-01-intra-1417.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1402.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1412.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1418.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-02-inter-1424.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1401.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1403.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1407.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1408.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1409.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1410.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1413.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1414.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1415.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1425.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1426.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1427.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1432.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1435.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1436.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1437.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1441.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-1442.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-01.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-02.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-03.ivf.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-03-segmentation-04.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1404.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1405.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-04-partitions-1406.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1428.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1429.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1430.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1431.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1433.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1434.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
@@ -517,6 +519,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
@@ -645,38 +649,34 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-05-resize.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-06-bilinear.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-07-frame_parallel.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x2_frame_parallel.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-aq2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-lf_deltas.webm.md5
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-10-show-existing-frame.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-11-size-351x287.webm
@@ -691,6 +691,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_2.ivf.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-12-droppable_3.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
 
 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # BBB VP9 streams
diff --git a/test/variance_test.cc b/test/variance_test.cc
index b9144ffab..c9bf13a6b 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -307,6 +307,19 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(4, 3, variance16x8_c),
                       make_tuple(4, 4, variance16x16_c)));
 
+#if HAVE_NEON
+const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon;
+const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon;
+const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon;
+const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP8VarianceTest,
+    ::testing::Values(make_tuple(3, 3, variance8x8_neon),
+                      make_tuple(3, 4, variance8x16_neon),
+                      make_tuple(4, 3, variance16x8_neon),
+                      make_tuple(4, 4, variance16x16_neon)));
+#endif
+
 #if HAVE_MMX
 const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
 const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
diff --git a/test/webm_video_source.h b/test/webm_video_source.h
index 4dcf4dc97..9d88ae360 100644
--- a/test/webm_video_source.h
+++ b/test/webm_video_source.h
@@ -14,7 +14,7 @@
 #include <cstdlib>
 #include <new>
 #include <string>
-#include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/nestegg/include/nestegg/nestegg.h"
 #include "test/video_source.h"
 
 namespace libvpx_test {
diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h
index 20d2be02b..74190432d 100644
--- a/test/y4m_video_source.h
+++ b/test/y4m_video_source.h
@@ -35,14 +35,11 @@ class Y4mVideoSource : public VideoSource {
 
   virtual ~Y4mVideoSource() {
     vpx_img_free(img_.get());
-    y4m_input_close(&y4m_);
-    if (input_file_)
-      fclose(input_file_);
+    CloseSource();
   }
 
   virtual void Begin() {
-    if (input_file_)
-      fclose(input_file_);
+    CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
     ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: "
         << file_name_;
@@ -89,6 +86,15 @@ class Y4mVideoSource : public VideoSource {
   }
 
  protected:
+  void CloseSource() {
+    y4m_input_close(&y4m_);
+    y4m_ = y4m_input();
+    if (input_file_ != NULL) {
+      fclose(input_file_);
+      input_file_ = NULL;
+    }
+  }
+
   std::string file_name_;
   FILE *input_file_;
   testing::internal::scoped_ptr<vpx_image_t> img_;
diff --git a/nestegg/0001-include-paths.diff b/third_party/nestegg/0001-include-paths.diff
index b6e07ae12..a704ebdcd 100644
--- a/nestegg/0001-include-paths.diff
+++ b/third_party/nestegg/0001-include-paths.diff
@@ -7,7 +7,7 @@ index 5758fc0..837b3ff 100644
  #include <string.h>  /* memset & co */
  
 -#include "halloc.h"
-+#include "nestegg/halloc/halloc.h"
++#include "third_party/nestegg/halloc/halloc.h"
  #include "align.h"
  #include "hlist.h"
  
@@ -34,8 +34,8 @@ index daf1eed..4fb10e7 100644
  
 -#include "halloc.h"
 -#include "nestegg/nestegg.h"
-+#include "nestegg/halloc/halloc.h"
-+#include "nestegg/include/nestegg/nestegg.h"
++#include "third_party/nestegg/halloc/halloc.h"
++#include "third_party/nestegg/include/nestegg/nestegg.h"
  
  /* EBML Elements */
  #define ID_EBML                 0x1a45dfa3
diff --git a/nestegg/0002-ne_read_simple-uninitialized_variable.diff b/third_party/nestegg/0002-ne_read_simple-uninitialized_variable.diff
index c3bc9e575..c3bc9e575 100644
--- a/nestegg/0002-ne_read_simple-uninitialized_variable.diff
+++ b/third_party/nestegg/0002-ne_read_simple-uninitialized_variable.diff
diff --git a/nestegg/AUTHORS b/third_party/nestegg/AUTHORS
index 7d2c61265..7d2c61265 100644
--- a/nestegg/AUTHORS
+++ b/third_party/nestegg/AUTHORS
diff --git a/nestegg/INSTALL b/third_party/nestegg/INSTALL
index 401df4184..401df4184 100644
--- a/nestegg/INSTALL
+++ b/third_party/nestegg/INSTALL
diff --git a/nestegg/LICENSE b/third_party/nestegg/LICENSE
index a67984a61..a67984a61 100644
--- a/nestegg/LICENSE
+++ b/third_party/nestegg/LICENSE
diff --git a/nestegg/README b/third_party/nestegg/README
index 47c8237d2..47c8237d2 100644
--- a/nestegg/README
+++ b/third_party/nestegg/README
diff --git a/nestegg/README.webm b/third_party/nestegg/README.webm
index c93116882..c93116882 100644
--- a/nestegg/README.webm
+++ b/third_party/nestegg/README.webm
diff --git a/nestegg/TODO b/third_party/nestegg/TODO
index bf0cb04c4..bf0cb04c4 100644
--- a/nestegg/TODO
+++ b/third_party/nestegg/TODO
diff --git a/nestegg/halloc/README b/third_party/nestegg/halloc/README
index 380fba2b8..380fba2b8 100644
--- a/nestegg/halloc/README
+++ b/third_party/nestegg/halloc/README
diff --git a/nestegg/halloc/halloc.h b/third_party/nestegg/halloc/halloc.h
index 10af4e8d8..10af4e8d8 100644
--- a/nestegg/halloc/halloc.h
+++ b/third_party/nestegg/halloc/halloc.h
diff --git a/nestegg/halloc/src/align.h b/third_party/nestegg/halloc/src/align.h
index 4c6e1831f..4c6e1831f 100644
--- a/nestegg/halloc/src/align.h
+++ b/third_party/nestegg/halloc/src/align.h
diff --git a/nestegg/halloc/src/halloc.c b/third_party/nestegg/halloc/src/halloc.c
index 837b3ff01..8860d736a 100644
--- a/nestegg/halloc/src/halloc.c
+++ b/third_party/nestegg/halloc/src/halloc.c
@@ -15,7 +15,7 @@
 #include <stdlib.h>  /* realloc */
 #include <string.h>  /* memset & co */
 
-#include "nestegg/halloc/halloc.h"
+#include "third_party/nestegg/halloc/halloc.h"
 #include "align.h"
 #include "hlist.h"
 
diff --git a/nestegg/halloc/src/hlist.h b/third_party/nestegg/halloc/src/hlist.h
index 2791f78c7..2791f78c7 100644
--- a/nestegg/halloc/src/hlist.h
+++ b/third_party/nestegg/halloc/src/hlist.h
diff --git a/nestegg/halloc/src/macros.h b/third_party/nestegg/halloc/src/macros.h
index 1f84bc277..1f84bc277 100644
--- a/nestegg/halloc/src/macros.h
+++ b/third_party/nestegg/halloc/src/macros.h
diff --git a/nestegg/include/nestegg/nestegg.h b/third_party/nestegg/include/nestegg/nestegg.h
index c18d1d3bf..c18d1d3bf 100644
--- a/nestegg/include/nestegg/nestegg.h
+++ b/third_party/nestegg/include/nestegg/nestegg.h
diff --git a/nestegg/src/nestegg.c b/third_party/nestegg/src/nestegg.c
index b6bc46093..30e0e2bb8 100644
--- a/nestegg/src/nestegg.c
+++ b/third_party/nestegg/src/nestegg.c
@@ -8,8 +8,8 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "nestegg/halloc/halloc.h"
-#include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/nestegg/halloc/halloc.h"
+#include "third_party/nestegg/include/nestegg/nestegg.h"
 
 /* EBML Elements */
 #define ID_EBML                 0x1a45dfa3
diff --git a/nestegg/test/test.c b/third_party/nestegg/test/test.c
index cc0753de7..cc0753de7 100644
--- a/nestegg/test/test.c
+++ b/third_party/nestegg/test/test.c
diff --git a/tools_common.c b/tools_common.c
index f0e160697..4f2ac7401 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -8,13 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "tools_common.h"
-
+#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "./tools_common.h"
+
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
 #endif
@@ -77,26 +78,6 @@ void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
   exit(EXIT_FAILURE);
 }
 
-uint16_t mem_get_le16(const void *data) {
-  uint16_t val;
-  const uint8_t *mem = (const uint8_t*)data;
-
-  val = mem[1] << 8;
-  val |= mem[0];
-  return val;
-}
-
-uint32_t mem_get_le32(const void *data) {
-  uint32_t val;
-  const uint8_t *mem = (const uint8_t*)data;
-
-  val = mem[3] << 24;
-  val |= mem[2] << 16;
-  val |= mem[1] << 8;
-  val |= mem[0];
-  return val;
-}
-
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
   FILE *f = input_ctx->file;
   struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
@@ -273,3 +254,14 @@ int vpx_img_read(vpx_image_t *img, FILE *file) {
   return 1;
 }
 
+// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t
+double sse_to_psnr(double samples, double peak, double sse) {
+  static const double kMaxPSNR = 100.0;
+
+  if (sse > 0.0) {
+    const double psnr = 10.0 * log10(samples * peak * peak / sse);
+    return psnr > kMaxPSNR ? kMaxPSNR : psnr;
+  } else {
+    return kMaxPSNR;
+  }
+}
diff --git a/tools_common.h b/tools_common.h
index 2e9025915..58894def0 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -118,9 +118,6 @@ void die_codec(vpx_codec_ctx_t *ctx, const char *s);
 /* The tool including this file must define usage_exit() */
 void usage_exit();
 
-uint16_t mem_get_le16(const void *data);
-uint32_t mem_get_le32(const void *data);
-
 int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
 
 typedef struct VpxInterface {
@@ -145,6 +142,8 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane);
 void vpx_img_write(const vpx_image_t *img, FILE *file);
 int vpx_img_read(vpx_image_t *img, FILE *file);
 
+double sse_to_psnr(double samples, double peak, double mse);
+
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
diff --git a/video_reader.c b/video_reader.c
index 4be7483d3..39c7edba1 100644
--- a/video_reader.c
+++ b/video_reader.c
@@ -14,6 +14,8 @@
 #include "./ivfdec.h"
 #include "./video_reader.h"
 
+#include "vpx_ports/mem_ops.h"
+
 static const char *const kIVFSignature = "DKIF";
 
 struct VpxVideoReaderStruct {
diff --git a/vp8/common/arm/neon/copymem16x16_neon.asm b/vp8/common/arm/neon/copymem16x16_neon.asm
deleted file mode 100644
index bda4b9654..000000000
--- a/vp8/common/arm/neon/copymem16x16_neon.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_copy_mem16x16_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem16x16_neon| PROC
-
-    vld1.u8     {q0}, [r0], r1
-    vld1.u8     {q1}, [r0], r1
-    vld1.u8     {q2}, [r0], r1
-    vst1.u8     {q0}, [r2], r3
-    vld1.u8     {q3}, [r0], r1
-    vst1.u8     {q1}, [r2], r3
-    vld1.u8     {q4}, [r0], r1
-    vst1.u8     {q2}, [r2], r3
-    vld1.u8     {q5}, [r0], r1
-    vst1.u8     {q3}, [r2], r3
-    vld1.u8     {q6}, [r0], r1
-    vst1.u8     {q4}, [r2], r3
-    vld1.u8     {q7}, [r0], r1
-    vst1.u8     {q5}, [r2], r3
-    vld1.u8     {q8}, [r0], r1
-    vst1.u8     {q6}, [r2], r3
-    vld1.u8     {q9}, [r0], r1
-    vst1.u8     {q7}, [r2], r3
-    vld1.u8     {q10}, [r0], r1
-    vst1.u8     {q8}, [r2], r3
-    vld1.u8     {q11}, [r0], r1
-    vst1.u8     {q9}, [r2], r3
-    vld1.u8     {q12}, [r0], r1
-    vst1.u8     {q10}, [r2], r3
-    vld1.u8     {q13}, [r0], r1
-    vst1.u8     {q11}, [r2], r3
-    vld1.u8     {q14}, [r0], r1
-    vst1.u8     {q12}, [r2], r3
-    vld1.u8     {q15}, [r0], r1
-    vst1.u8     {q13}, [r2], r3
-    vst1.u8     {q14}, [r2], r3
-    vst1.u8     {q15}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp8_copy_mem16x16_neon|
-
-    END
diff --git a/vp8/common/arm/neon/copymem8x4_neon.asm b/vp8/common/arm/neon/copymem8x4_neon.asm
deleted file mode 100644
index 35c0f6708..000000000
--- a/vp8/common/arm/neon/copymem8x4_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_copy_mem8x4_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x4_neon| PROC
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vst1.u8     {d3}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp8_copy_mem8x4_neon|
-
-    END
diff --git a/vp8/common/arm/neon/copymem8x8_neon.asm b/vp8/common/arm/neon/copymem8x8_neon.asm
deleted file mode 100644
index 1f5b9411b..000000000
--- a/vp8/common/arm/neon/copymem8x8_neon.asm
+++ /dev/null
@@ -1,43 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_copy_mem8x8_neon|
-    ; ARM
-    ; REQUIRE8
-    ; PRESERVE8
-
-    AREA    Block, CODE, READONLY ; name this block of code
-;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_copy_mem8x8_neon| PROC
-
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d1}, [r0], r1
-    vst1.u8     {d0}, [r2], r3
-    vld1.u8     {d2}, [r0], r1
-    vst1.u8     {d1}, [r2], r3
-    vld1.u8     {d3}, [r0], r1
-    vst1.u8     {d2}, [r2], r3
-    vld1.u8     {d4}, [r0], r1
-    vst1.u8     {d3}, [r2], r3
-    vld1.u8     {d5}, [r0], r1
-    vst1.u8     {d4}, [r2], r3
-    vld1.u8     {d6}, [r0], r1
-    vst1.u8     {d5}, [r2], r3
-    vld1.u8     {d7}, [r0], r1
-    vst1.u8     {d6}, [r2], r3
-    vst1.u8     {d7}, [r2], r3
-
-    mov     pc, lr
-
-    ENDP  ; |vp8_copy_mem8x8_neon|
-
-    END
diff --git a/vp8/common/arm/neon/copymem_neon.c b/vp8/common/arm/neon/copymem_neon.c
new file mode 100644
index 000000000..deced115c
--- /dev/null
+++ b/vp8/common/arm/neon/copymem_neon.c
@@ -0,0 +1,59 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_copy_mem8x4_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    uint8x8_t vtmp;
+    int r;
+
+    for (r = 0; r < 4; r++) {
+        vtmp = vld1_u8(src);
+        vst1_u8(dst, vtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem8x8_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    uint8x8_t vtmp;
+    int r;
+
+    for (r = 0; r < 8; r++) {
+        vtmp = vld1_u8(src);
+        vst1_u8(dst, vtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_copy_mem16x16_neon(
+        unsigned char *src,
+        int src_stride,
+        unsigned char *dst,
+        int dst_stride) {
+    int r;
+    uint8x16_t qtmp;
+
+    for (r = 0; r < 16; r++) {
+        qtmp = vld1q_u8(src);
+        vst1q_u8(dst, qtmp);
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
deleted file mode 100644
index 79ff02c69..000000000
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,54 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dc_only_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-;                            int pred_stride, unsigned char *dst_ptr,
-;                            int dst_stride)
-
-; r0  input_dc
-; r1  pred_ptr
-; r2  pred_stride
-; r3  dst_ptr
-; sp  dst_stride
-
-|vp8_dc_only_idct_add_neon| PROC
-    add             r0, r0, #4
-    asr             r0, r0, #3
-    ldr             r12, [sp]
-    vdup.16         q0, r0
-
-    vld1.32         {d2[0]}, [r1], r2
-    vld1.32         {d2[1]}, [r1], r2
-    vld1.32         {d4[0]}, [r1], r2
-    vld1.32         {d4[1]}, [r1]
-
-    vaddw.u8        q1, q0, d2
-    vaddw.u8        q2, q0, d4
-
-    vqmovun.s16     d2, q1
-    vqmovun.s16     d4, q2
-
-    vst1.32         {d2[0]}, [r3], r12
-    vst1.32         {d2[1]}, [r3], r12
-    vst1.32         {d4[0]}, [r3], r12
-    vst1.32         {d4[1]}, [r3]
-
-    bx              lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.c b/vp8/common/arm/neon/dc_only_idct_add_neon.c
new file mode 100644
index 000000000..ad5f41d7d
--- /dev/null
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.c
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_dc_only_idct_add_neon(
+        int16_t input_dc,
+        unsigned char *pred_ptr,
+        int pred_stride,
+        unsigned char *dst_ptr,
+        int dst_stride) {
+    int i;
+    uint16_t a1 = ((input_dc + 4) >> 3);
+    uint32x2_t d2u32 = vdup_n_u32(0);
+    uint8x8_t d2u8;
+    uint16x8_t q1u16;
+    uint16x8_t qAdd;
+
+    qAdd = vdupq_n_u16(a1);
+
+    for (i = 0; i < 2; i++) {
+        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
+        pred_ptr += pred_stride;
+        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
+        pred_ptr += pred_stride;
+
+        q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
+        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
+        dst_ptr += dst_stride;
+        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
+        dst_ptr += dst_stride;
+    }
+}
diff --git a/vp8/common/arm/neon/dequant_idct_neon.asm b/vp8/common/arm/neon/dequant_idct_neon.asm
deleted file mode 100644
index 602cce676..000000000
--- a/vp8/common/arm/neon/dequant_idct_neon.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_add_neon(short *input, short *dq,
-;                           unsigned char *dest, int stride)
-; r0    short *input,
-; r1    short *dq,
-; r2    unsigned char *dest
-; r3    int stride
-
-|vp8_dequant_idct_add_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-
-    add             r1, r2, r3              ; r1 = dest + stride
-    lsl             r3, #1                  ; 2x stride
-
-    vld1.32         {d14[0]}, [r2], r3
-    vld1.32         {d14[1]}, [r1], r3
-    vld1.32         {d15[0]}, [r2]
-    vld1.32         {d15[1]}, [r1]
-
-    adr             r12, cospi8sqrt2minus1  ; pointer to the first constant
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vaddw.u8        q1, q1, d14
-    vaddw.u8        q2, q2, d15
-
-    sub             r2, r2, r3
-    sub             r1, r1, r3
-
-    vqmovun.s16     d0, q1
-    vqmovun.s16     d1, q2
-
-    vst1.32         {d0[0]}, [r2], r3
-    vst1.32         {d0[1]}, [r1], r3
-    vst1.32         {d1[0]}, [r2]
-    vst1.32         {d1[1]}, [r1]
-
-    bx             lr
-
-    ENDP           ; |vp8_dequant_idct_add_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2       DCD 0x8a8c8a8c
-
-    END
diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c
new file mode 100644
index 000000000..58e11922c
--- /dev/null
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -0,0 +1,142 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2       = 35468;
+
+void vp8_dequant_idct_add_neon(
+        int16_t *input,
+        int16_t *dq,
+        unsigned char *dst,
+        int stride) {
+    unsigned char *dst0;
+    int32x2_t d14, d15;
+    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+    int16x8_t q1, q2, q3, q4, q5, q6;
+    int16x8_t qEmpty = vdupq_n_s16(0);
+    int32x2x2_t d2tmp0, d2tmp1;
+    int16x4x2_t d2tmp2, d2tmp3;
+
+    d14 = d15 = vdup_n_s32(0);
+
+    // load input
+    q3 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+    input += 8;
+    q4 = vld1q_s16(input);
+    vst1q_s16(input, qEmpty);
+
+    // load dq
+    q5 = vld1q_s16(dq);
+    dq += 8;
+    q6 = vld1q_s16(dq);
+
+    // load src from dst
+    dst0 = dst;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+
+    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
+                                         vreinterpretq_u16_s16(q5)));
+    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
+                                         vreinterpretq_u16_s16(q6)));
+
+    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+
+    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    // loop 2
+    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+
+    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+
+    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+
+    q3 = vshrq_n_s16(q3, 1);
+    q4 = vshrq_n_s16(q4, 1);
+
+    q3 = vqaddq_s16(q3, q2);
+    q4 = vqaddq_s16(q4, q2);
+
+    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+
+    d2 = vqadd_s16(d12, d11);
+    d3 = vqadd_s16(d13, d10);
+    d4 = vqsub_s16(d13, d10);
+    d5 = vqsub_s16(d12, d11);
+
+    d2 = vrshr_n_s16(d2, 3);
+    d3 = vrshr_n_s16(d3, 3);
+    d4 = vrshr_n_s16(d4, 3);
+    d5 = vrshr_n_s16(d5, 3);
+
+    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                      vreinterpret_s16_s32(d2tmp1.val[0]));
+    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                      vreinterpret_s16_s32(d2tmp1.val[1]));
+
+    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+
+    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
+                                        vreinterpret_u8_s32(d14)));
+    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
+                                        vreinterpret_u8_s32(d15)));
+
+    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 1);
+    return;
+}
diff --git a/vp8/common/arm/neon/dequantizeb_neon.asm b/vp8/common/arm/neon/dequantizeb_neon.asm
deleted file mode 100644
index c8e0c31f2..000000000
--- a/vp8/common/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_neon| PROC
-    vld1.16         {q0, q1}, [r0]
-    vld1.16         {q2, q3}, [r1]
-
-    vmul.i16        q4, q0, q2
-    vmul.i16        q5, q1, q3
-
-    vst1.16         {q4, q5}, [r2]
-
-    bx             lr
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/dequantizeb_neon.c b/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 000000000..60f69c8db
--- /dev/null
+++ b/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_dequantize_b_loop_neon(
+        int16_t *Q,
+        int16_t *DQC,
+        int16_t *DQ) {
+    int16x8x2_t qQ, qDQC, qDQ;
+
+    qQ   = vld2q_s16(Q);
+    qDQC = vld2q_s16(DQC);
+
+    qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+    qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+    vst2q_s16(DQ, qDQ);
+    return;
+}
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 7f3964665..849a0ed2a 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -19,7 +19,7 @@
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "psnr.h"
+#include "vpx/internal/vpx_psnr.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
@@ -2170,10 +2170,12 @@ void vp8_remove_compressor(VP8_COMP **ptr)
                                               8.0 / 1000.0  / time_encoded;
                         double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
                                          lst_yv12->y_width * lst_yv12->y_height;
-                        double total_psnr = vp8_mse2psnr(samples, 255.0,
-                                                  cpi->total_error2[i]);
-                        double total_psnr2 = vp8_mse2psnr(samples, 255.0,
-                                                  cpi->total_error2_p[i]);
+                        double total_psnr =
+                            vpx_sse_to_psnr(samples, 255.0,
+                                            cpi->total_error2[i]);
+                        double total_psnr2 =
+                            vpx_sse_to_psnr(samples, 255.0,
+                                            cpi->total_error2_p[i]);
                         double total_ssim = 100 * pow(cpi->sum_ssim[i] /
                                                       cpi->sum_weights[i], 8.0);
 
@@ -2190,9 +2192,9 @@ void vp8_remove_compressor(VP8_COMP **ptr)
                 {
                     double samples = 3.0 / 2 * cpi->count *
                                         lst_yv12->y_width * lst_yv12->y_height;
-                    double total_psnr = vp8_mse2psnr(samples, 255.0,
-                                                         cpi->total_sq_error);
-                    double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+                    double total_psnr = vpx_sse_to_psnr(samples, 255.0,
+                                                        cpi->total_sq_error);
+                    double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
                                                          cpi->total_sq_error2);
                     double total_ssim = 100 * pow(cpi->summed_quality /
                                                       cpi->summed_weights, 8.0);
@@ -2522,8 +2524,8 @@ static void generate_psnr_packet(VP8_COMP *cpi)
     pkt.data.psnr.samples[3] = width * height;
 
     for (i = 0; i < 4; i++)
-        pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                             (double)(pkt.data.psnr.sse[i]));
+        pkt.data.psnr.psnr[i] = vpx_sse_to_psnr(pkt.data.psnr.samples[i], 255.0,
+                                                (double)(pkt.data.psnr.sse[i]));
 
     vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
@@ -5284,11 +5286,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
 
                 sq_error = (double)(ye + ue + ve);
 
-                frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+                frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error);
 
-                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
-                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
-                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
+                cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye);
+                cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue);
+                cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve);
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
 #if CONFIG_POSTPROC
@@ -5311,14 +5313,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
 
                     sq_error2 = (double)(ye + ue + ve);
 
-                    frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
+                    frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2);
 
-                    cpi->totalp_y += vp8_mse2psnr(y_samples,
-                                                  255.0, (double)ye);
-                    cpi->totalp_u += vp8_mse2psnr(uv_samples,
-                                                  255.0, (double)ue);
-                    cpi->totalp_v += vp8_mse2psnr(uv_samples,
-                                                  255.0, (double)ve);
+                    cpi->totalp_y += vpx_sse_to_psnr(y_samples,
+                                                     255.0, (double)ye);
+                    cpi->totalp_u += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ue);
+                    cpi->totalp_v += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ve);
                     cpi->total_sq_error2 += sq_error2;
                     cpi->totalp  += frame_psnr2;
 
diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c
deleted file mode 100644
index b3a3d9552..000000000
--- a/vp8/encoder/psnr.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/yv12config.h"
-#include "math.h"
-#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
-
-#define MAX_PSNR 100
-
-double vp8_mse2psnr(double Samples, double Peak, double Mse)
-{
-    double psnr;
-
-    if ((double)Mse > 0.0)
-        psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-    else
-        psnr = MAX_PSNR;      /* Limit to prevent / 0 */
-
-    if (psnr > MAX_PSNR)
-        psnr = MAX_PSNR;
-
-    return psnr;
-}
diff --git a/vp8/encoder/psnr.h b/vp8/encoder/psnr.h
deleted file mode 100644
index 0c6c088c4..000000000
--- a/vp8/encoder/psnr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP8_ENCODER_PSNR_H_
-#define VP8_ENCODER_PSNR_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP8_ENCODER_PSNR_H_
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 7e3af71ec..513b2bfea 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -16,7 +16,6 @@
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "psnr.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index a2127c9b7..ac91d7af5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -159,10 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
 # common (neon)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem8x4_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/iwalsh_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfilter_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
@@ -177,10 +173,8 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict8x8_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict16x16_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/save_reg_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/idct_blk_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
@@ -189,6 +183,10 @@ VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance16x16s_neon
 
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequant_idct_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
 
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index ce789e218..4c896b1d1 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -1265,6 +1265,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
         "vp8.fpf"           /* first pass filename */
 #endif
         VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+        {0},                /* ss_target_bitrate */
         1,                  /* ts_number_layers */
         {0},                /* ts_target_bitrate */
         {0},                /* ts_rate_decimator */
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index cd091f39a..d7c6dd1e1 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -50,7 +50,6 @@ VP8_CX_SRCS-yes += encoder/mcomp.h
 VP8_CX_SRCS-yes += encoder/modecosts.h
 VP8_CX_SRCS-yes += encoder/onyx_int.h
 VP8_CX_SRCS-yes += encoder/pickinter.h
-VP8_CX_SRCS-yes += encoder/psnr.h
 VP8_CX_SRCS-yes += encoder/quantize.h
 VP8_CX_SRCS-yes += encoder/ratectrl.h
 VP8_CX_SRCS-yes += encoder/rdopt.h
@@ -61,7 +60,6 @@ VP8_CX_SRCS-yes += encoder/modecosts.c
 VP8_CX_SRCS-yes += encoder/onyx_if.c
 VP8_CX_SRCS-yes += encoder/pickinter.c
 VP8_CX_SRCS-yes += encoder/picklpf.c
-VP8_CX_SRCS-yes += encoder/psnr.c
 VP8_CX_SRCS-yes += encoder/quantize.c
 VP8_CX_SRCS-yes += encoder/ratectrl.c
 VP8_CX_SRCS-yes += encoder/rdopt.c
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
index 93d3af301..b41f5661b 100644
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
@@ -576,6 +576,7 @@
     vld1.s16        {q14,q15}, [r0]!
 
     push            {r0-r10}
+    vpush           {d8-d15}
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -636,6 +637,7 @@ iadst_iadst
     IADST8X8_1D
 
 end_vp9_iht8x8_64_add_neon
+    vpop           {d8-d15}
     pop            {r0-r10}
 
     ; ROUND_POWER_OF_TWO(temp_out[j], 5)
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
deleted file mode 100644
index 536febb65..000000000
--- a/vp9/common/generic/vp9_systemdependent.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-void vp9_machine_specific_config(VP9_COMMON *cm) {
-  (void)cm;
-  vp9_rtcd();
-}
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 6f771992b..a72821b34 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -200,10 +200,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   return 1;
 }
 
-void vp9_create_common(VP9_COMMON *cm) {
-  vp9_machine_specific_config(cm);
-}
-
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_frame_buffers(cm);
   vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index e3b5b95d8..066c778c3 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -22,7 +22,6 @@ void vp9_initialize_common();
 
 void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
 
-void vp9_create_common(VP9_COMMON *cm);
 void vp9_remove_common(VP9_COMMON *cm);
 
 int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height);
diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c
index d918bedc6..e1d1318ac 100644
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -16,8 +16,7 @@ MB_PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi,
     if (!left_mi || is_inter_block(&left_mi->mbmi))
       return DC_PRED;
 
-    return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
-                                             : left_mi->mbmi.mode;
+    return get_y_mode(left_mi, b + 1);
   } else {
     assert(b == 1 || b == 3);
     return cur_mi->bmi[b - 1].as_mode;
@@ -30,8 +29,7 @@ MB_PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi,
     if (!above_mi || is_inter_block(&above_mi->mbmi))
       return DC_PRED;
 
-    return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
-                                              : above_mi->mbmi.mode;
+    return get_y_mode(above_mi, b + 2);
   } else {
     assert(b == 2 || b == 3);
     return cur_mi->bmi[b - 2].as_mode;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 6086323f6..2a0ebfba0 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -144,6 +144,11 @@ typedef struct {
   b_mode_info bmi[4];
 } MODE_INFO;
 
+static INLINE MB_PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) {
+  return mi->mbmi.sb_type < BLOCK_8X8 ? mi->bmi[block].as_mode
+                                      : mi->mbmi.mode;
+}
+
 static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
   return mbmi->ref_frame[0] > INTRA_FRAME;
 }
@@ -255,13 +260,11 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
 static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
                                       const MACROBLOCKD *xd, int ib) {
   const MODE_INFO *const mi = xd->mi_8x8[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
+  if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
     return DCT_DCT;
 
-  return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? mi->bmi[ib].as_mode
-                                                 : mbmi->mode];
+  return mode2txfm_map[get_y_mode(mi, ib)];
 }
 
 static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
@@ -328,13 +331,6 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
 
-
-static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
-                             TX_SIZE tx_size) {
-  const int eob_max = 16 << (tx_size << 1);
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index aab8b5388..bd5086aa1 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -177,13 +177,11 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
 static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                                          PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi_8x8[0];
-  const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (is_inter_block(mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
     return &vp9_default_scan_orders[tx_size];
   } else {
-    const MB_PREDICTION_MODE mode =
-        mbmi->sb_type < BLOCK_8X8 ? mi->bmi[block_idx].as_mode : mbmi->mode;
+    const MB_PREDICTION_MODE mode = get_y_mode(mi, block_idx);
     return &vp9_scan_orders[tx_size][mode2txfm_map[mode]];
   }
 }
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index fe2056d14..868a66ae4 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -262,9 +262,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
     int lvl_seg = default_filt_lvl;
     if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
       const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
-      lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
-                  ? data
-                  : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
+      lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ?
+                      data : default_filt_lvl + data,
+                      0, MAX_LOOP_FILTER);
     }
 
     if (!lf->mode_ref_delta_enabled) {
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 3b0d1e982..e5f3fed45 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -186,13 +186,12 @@ static INLINE int is_inside(const TileInfo *const tile,
 
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
-void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                          const TileInfo *const tile,
-                          MODE_INFO *mi, const MODE_INFO *prev_mi,
-                          MV_REFERENCE_FRAME ref_frame,
-                          int_mv *mv_ref_list,
-                          int block_idx,
-                          int mi_row, int mi_col) {
+static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                             const TileInfo *const tile,
+                             MODE_INFO *mi, const MODE_INFO *prev_mi,
+                             MV_REFERENCE_FRAME ref_frame,
+                             int_mv *mv_ref_list,
+                             int block_idx, int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
@@ -291,6 +290,16 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
     clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
 }
 
+void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                                    const TileInfo *const tile,
+                                    MODE_INFO *mi, const MODE_INFO *prev_mi,
+                                    MV_REFERENCE_FRAME ref_frame,
+                                    int_mv *mv_ref_list,
+                                    int mi_row, int mi_col) {
+  find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, mv_ref_list, -1,
+                   mi_row, mi_col);
+}
+
 static void lower_mv_precision(MV *mv, int allow_hp) {
   const int use_hp = allow_hp && vp9_use_mv_hp(mv);
   if (!use_hp) {
@@ -325,8 +334,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
-  vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref],
-                       mv_list, block, mi_row, mi_col);
+  find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref],
+                   mv_list, block, mi_row, mi_col);
 
   near->as_int = 0;
   switch (block) {
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index f99952f3c..04cb000ef 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -17,29 +17,24 @@
 extern "C" {
 #endif
 
+#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\
+                                VP9_INTERP_EXTEND) << 3)
 
-void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                          const TileInfo *const tile,
-                          MODE_INFO *mi, const MODE_INFO *prev_mi,
-                          MV_REFERENCE_FRAME ref_frame,
-                          int_mv *mv_ref_list,
-                          int block_idx,
-                          int mi_row, int mi_col);
-
-static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                    const TileInfo *const tile,
-                                    MODE_INFO *mi, const MODE_INFO *prev_mi,
-                                    MV_REFERENCE_FRAME ref_frame,
-                                    int_mv *mv_ref_list,
-                                    int mi_row, int mi_col) {
-  vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame,
-                       mv_ref_list, -1, mi_row, mi_col);
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+               xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
-#define LEFT_TOP_MARGIN     ((VP9_ENC_BORDER_IN_PIXELS  \
-                            - VP9_INTERP_EXTEND) << 3)
-#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS  \
-                            - VP9_INTERP_EXTEND) << 3)
+void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+                      const TileInfo *const tile,
+                      MODE_INFO *mi, const MODE_INFO *prev_mi,
+                      MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int mi_row, int mi_col);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
@@ -47,14 +42,6 @@ static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
                            int_mv *mvlist, int_mv *nearest, int_mv *near);
 
-// TODO(jingning): this mv clamping function should be block size dependent.
-static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
-  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
-               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-               xd->mb_to_top_edge - LEFT_TOP_MARGIN,
-               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-}
-
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    const TileInfo *const tile,
                                    int block, int ref, int mi_row, int mi_col,
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index ab27ca523..222086886 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -149,6 +149,8 @@ extern "C" {
     // Spatial and temporal scalability.
     int ss_number_layers;  // Number of spatial layers.
     int ts_number_layers;  // Number of temporal layers.
+    // Bitrate allocation for spatial layers.
+    int ss_target_bitrate[VPX_SS_MAX_LAYERS];
     // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
     int ts_target_bitrate[VPX_TS_MAX_LAYERS];
     int ts_rate_decimator[VPX_TS_MAX_LAYERS];
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c
index 9fef8b1ef..def12554d 100644
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -134,9 +134,9 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex) {
   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
     const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
-    return seg->abs_delta == SEGMENT_ABSDATA ?
-                             data :  // Abs value
-                             clamp(base_qindex + data, 0, MAXQ);  // Delta value
+    const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ?
+        data : base_qindex + data;
+    return clamp(seg_qindex, 0, MAXQ);
   } else {
     return base_qindex;
   }
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index df603ad70..bdcfafa4e 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -244,7 +244,6 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
 // TODO(jingning): This function serves as a placeholder for decoder prediction
 // using on demand border extension. It should be moved to /decoder/ directory.
 static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                       int bw, int bh,
                                        int x, int y, int w, int h,
                                        int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -378,10 +377,10 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
       assert(bsize == BLOCK_8X8);
       for (y = 0; y < num_4x4_h; ++y)
         for (x = 0; x < num_4x4_w; ++x)
-          dec_build_inter_predictors(xd, plane, i++, bw, bh,
+          dec_build_inter_predictors(xd, plane, i++,
                                      4 * x, 4 * y, 4, 4, mi_x, mi_y);
     } else {
-      dec_build_inter_predictors(xd, plane, 0, bw, bh,
+      dec_build_inter_predictors(xd, plane, 0,
                                  0, 0, bw, bh, mi_x, mi_y);
     }
   }
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 96ba3e464..71a41a9de 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -382,34 +382,34 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
       /* slower path if the block needs border extension */
       if (x0 + 2 * bs <= frame_width) {
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row - 1, above_ref - 1, 2 * bs + 1);
+          vpx_memcpy(above_row, above_ref, 2 * bs);
         } else {
-          vpx_memcpy(above_row - 1, above_ref - 1, bs + 1);
+          vpx_memcpy(above_row, above_ref, bs);
           vpx_memset(above_row + bs, above_row[bs - 1], bs);
         }
       } else if (x0 + bs <= frame_width) {
         const int r = frame_width - x0;
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row - 1, above_ref - 1, r + 1);
+          vpx_memcpy(above_row, above_ref, r);
           vpx_memset(above_row + r, above_row[r - 1],
                      x0 + 2 * bs - frame_width);
         } else {
-          vpx_memcpy(above_row - 1, above_ref - 1, bs + 1);
+          vpx_memcpy(above_row, above_ref, bs);
           vpx_memset(above_row + bs, above_row[bs - 1], bs);
         }
       } else if (x0 <= frame_width) {
         const int r = frame_width - x0;
         if (right_available && bs == 4) {
-          vpx_memcpy(above_row - 1, above_ref - 1, r + 1);
+          vpx_memcpy(above_row, above_ref, r);
           vpx_memset(above_row + r, above_row[r - 1],
                      x0 + 2 * bs - frame_width);
         } else {
-          vpx_memcpy(above_row - 1, above_ref - 1, r + 1);
+          vpx_memcpy(above_row, above_ref, r);
           vpx_memset(above_row + r, above_row[r - 1],
                      x0 + 2 * bs - frame_width);
         }
-        above_row[-1] = left_available ? above_ref[-1] : 129;
       }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
     } else {
       /* faster path if the block does not need extension */
       if (bs == 4 && right_available && left_available) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index bd9571e0a..5b449705a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -683,7 +683,7 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
 
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
+prototype int64_t vp9_block_error "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
 specialize vp9_block_error $sse2_x86inc
 
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
@@ -737,7 +737,7 @@ specialize vp9_fdct32x32_rd sse2 avx2
 #
 # Motion search
 #
-prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n"
+prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv"
 specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index 7455abce3..72edbca55 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -76,9 +76,6 @@ static INLINE int get_msb(unsigned int n) {
 }
 #endif
 
-struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *cm);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 791d0f2e7..8bebca51a 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -15,6 +15,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -39,20 +40,16 @@
 #include "vp9/decoder/vp9_reader.h"
 #include "vp9/decoder/vp9_thread.h"
 
-static int read_be32(const uint8_t *p) {
-  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
-}
-
 static int is_compound_reference_allowed(const VP9_COMMON *cm) {
   int i;
   for (i = 1; i < REFS_PER_FRAME; ++i)
-    if  (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
+    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
       return 1;
 
   return 0;
 }
 
-static void setup_compound_reference(VP9_COMMON *cm) {
+static void setup_compound_reference_mode(VP9_COMMON *cm) {
   if (cm->ref_frame_sign_bias[LAST_FRAME] ==
           cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
     cm->comp_fixed_ref = ALTREF_FRAME;
@@ -116,33 +113,34 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
       vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
 }
 
-static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) {
+static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
+                                                vp9_reader *r) {
   if (is_compound_reference_allowed(cm)) {
-    REFERENCE_MODE mode = vp9_read_bit(r);
-    if (mode)
-      mode += vp9_read_bit(r);
-    setup_compound_reference(cm);
-    return mode;
+    return vp9_read_bit(r) ? (vp9_read_bit(r) ? REFERENCE_MODE_SELECT
+                                              : COMPOUND_REFERENCE)
+                           : SINGLE_REFERENCE;
   } else {
     return SINGLE_REFERENCE;
   }
 }
 
-static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
+static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
+  FRAME_CONTEXT *const fc = &cm->fc;
   int i;
+
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
-    for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-      vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
+    for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
+      vp9_diff_update_prob(r, &fc->comp_inter_prob[i]);
 
   if (cm->reference_mode != COMPOUND_REFERENCE)
-    for (i = 0; i < REF_CONTEXTS; i++) {
-      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
-      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+      vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]);
+      vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]);
     }
 
   if (cm->reference_mode != SINGLE_REFERENCE)
-    for (i = 0; i < REF_CONTEXTS; i++)
-      vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
+    for (i = 0; i < REF_CONTEXTS; ++i)
+      vp9_diff_update_prob(r, &fc->comp_ref_prob[i]);
 }
 
 static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
@@ -289,10 +287,8 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
   MACROBLOCKD *const xd = args->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   MODE_INFO *const mi = xd->mi_8x8[0];
-  const MB_PREDICTION_MODE mode = (plane == 0)
-          ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[block].as_mode
-                                            : mi->mbmi.mode)
-          : mi->mbmi.uv_mode;
+  const MB_PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
+                                               : mi->mbmi.uv_mode;
   int x, y;
   uint8_t *dst;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
@@ -836,7 +832,7 @@ static size_t get_tile(const uint8_t *const data_end,
       vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                          "Truncated packet or corrupt tile length");
 
-    size = read_be32(*data);
+    size = mem_get_be32(*data);
     *data += 4;
 
     if (size > (size_t)(data_end - *data))
@@ -1091,7 +1087,7 @@ static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
   }
 }
 
-static void error_handler(void *data, size_t bit_offset) {
+static void error_handler(void *data) {
   VP9_COMMON *const cm = (VP9_COMMON *)data;
   vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
@@ -1120,6 +1116,12 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
+
+    if (cm->frame_bufs[frame_to_show].ref_count < 1)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a decoded frame",
+                         frame_to_show);
+
     ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
     pbi->refresh_frame_flags = 0;
     cm->lf.filter_level = 0;
@@ -1196,8 +1198,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
                                           ref_buf->buf->y_crop_height,
                                           cm->width, cm->height);
         if (vp9_is_scaled(&ref_buf->sf))
-          vp9_extend_frame_borders(ref_buf->buf,
-                                   cm->subsampling_x, cm->subsampling_y);
+          vp9_extend_frame_borders(ref_buf->buf);
       }
     }
   }
@@ -1265,8 +1266,10 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
 
-    cm->reference_mode = read_reference_mode(cm, &r);
-    read_reference_mode_probs(cm, &r);
+    cm->reference_mode = read_frame_reference_mode(cm, &r);
+    if (cm->reference_mode != SINGLE_REFERENCE)
+      setup_compound_reference_mode(cm);
+    read_frame_reference_mode_probs(cm, &r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 4de8db37c..0fb7a1580 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -257,13 +257,18 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
   mv->col = ref->col + diff.col;
 }
 
-static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, const MACROBLOCKD *xd,
-                                          vp9_reader *r) {
-  const int ctx = vp9_get_reference_mode_context(cm, xd);
-  const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
-  if (!cm->frame_parallel_decoding_mode)
-    ++cm->counts.comp_inter[ctx][mode];
-  return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
+static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
+                                                const MACROBLOCKD *xd,
+                                                vp9_reader *r) {
+  if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+    const int ctx = vp9_get_reference_mode_context(cm, xd);
+    const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+    if (!cm->frame_parallel_decoding_mode)
+      ++cm->counts.comp_inter[ctx][mode];
+    return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
+  } else {
+    return cm->reference_mode;
+  }
 }
 
 // Read the referncence frame
@@ -277,10 +282,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
     ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
     ref_frame[1] = NONE;
   } else {
-    const REFERENCE_MODE mode = (cm->reference_mode == REFERENCE_MODE_SELECT)
-                                      ? read_reference_mode(cm, xd, r)
-                                      : cm->reference_mode;
-
+    const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
     if (mode == COMPOUND_REFERENCE) {
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 1d3522e13..5ebfcee33 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -138,7 +138,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
   cm->error.setjmp = 1;
   vp9_initialize_dec();
 
-  vp9_create_common(cm);
+  vp9_rtcd();
 
   pbi->oxcf = *oxcf;
   pbi->ready_for_new_data = 1;
diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h
index 619e39f1e..8cb424703 100644
--- a/vp9/decoder/vp9_read_bit_buffer.h
+++ b/vp9/decoder/vp9_read_bit_buffer.h
@@ -19,7 +19,7 @@
 extern "C" {
 #endif
 
-typedef void (*vp9_rb_error_handler)(void *data, size_t bit_offset);
+typedef void (*vp9_rb_error_handler)(void *data);
 
 struct vp9_read_bit_buffer {
   const uint8_t *bit_buffer;
@@ -39,7 +39,7 @@ static int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
   const size_t p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
   if (rb->bit_buffer + p >= rb->bit_buffer_end) {
-    rb->error_handler(rb->error_handler_data, rb->bit_offset);
+    rb->error_handler(rb->error_handler_data);
     return 0;
   } else {
     const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 31ec069d0..0f1692df0 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -14,20 +14,21 @@
 
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_tile_common.h"
 
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
 #include "vp9/encoder/vp9_tokenize.h"
@@ -61,15 +62,8 @@ static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-static INLINE void write_be32(uint8_t *p, int value) {
-  p[0] = value >> 24;
-  p[1] = value >> 16;
-  p[2] = value >> 8;
-  p[3] = value;
-}
-
-void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
-                             int data, int max) {
+static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+                                int data, int max) {
   vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
@@ -88,7 +82,7 @@ static void prob_diff_update(const vp9_tree_index *tree,
     vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
 }
 
-static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
+static void write_selected_tx_size(const VP9_COMP *cpi,
                                    TX_SIZE tx_size, BLOCK_SIZE bsize,
                                    vp9_writer *w) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -115,15 +109,14 @@ static int write_skip(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
   }
 }
 
-void vp9_update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
   int k;
 
   for (k = 0; k < SKIP_CONTEXTS; ++k)
     vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
-  VP9_COMMON *const cm = &cpi->common;
+static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     prob_diff_update(vp9_switchable_interp_tree,
@@ -131,9 +124,8 @@ static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
                      cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void pack_mb_tokens(vp9_writer* const w,
-                           TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop) {
+static void pack_mb_tokens(vp9_writer *w,
+                           TOKENEXTRA **tp, const TOKENEXTRA *stop) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
@@ -200,45 +192,40 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
 }
 
 // This function encodes the reference frame
-static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi;
-  const int segment_id = mi->segment_id;
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  const int segment_id = mbmi->segment_id;
+
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (!seg_ref_active) {
+  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    assert(!is_compound);
+    assert(mbmi->ref_frame[0] ==
+               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+  } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
-                vp9_get_reference_mode_prob(cm, xd));
+      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
-      assert((mi->ref_frame[1] <= INTRA_FRAME) ==
-             (cm->reference_mode == SINGLE_REFERENCE));
+      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
-    if (mi->ref_frame[1] > INTRA_FRAME) {
-      vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
+    if (is_compound) {
+      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
-      vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
-                vp9_get_pred_prob_single_ref_p1(cm, xd));
-      if (mi->ref_frame[0] != LAST_FRAME)
-        vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
-                  vp9_get_pred_prob_single_ref_p2(cm, xd));
+      const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      if (bit0) {
+        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+      }
     }
-  } else {
-    assert(mi->ref_frame[1] <= INTRA_FRAME);
-    assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
-           mi->ref_frame[0]);
   }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment.
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -247,7 +234,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
-  MB_MODE_INFO *const mi = &m->mbmi;
+  const MB_MODE_INFO *const mi = &m->mbmi;
   const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
   const MV_REFERENCE_FRAME ref1 = mi->ref_frame[1];
   const MB_PREDICTION_MODE mode = mi->mode;
@@ -280,7 +267,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
       !(ref0 != INTRA_FRAME &&
         (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
-    write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc);
+    write_selected_tx_size(cpi, mi->tx_size, bsize, bc);
   }
 
   if (ref0 == INTRA_FRAME) {
@@ -304,7 +291,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
     write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob *mv_ref_p;
-    encode_ref_frame(cpi, bc);
+    write_ref_frames(cpi, bc);
     mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]];
 
 #ifdef ENTROPY_STATS
@@ -382,7 +369,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
   write_skip(cpi, segment_id, m, bc);
 
   if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
-    write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc);
+    write_selected_tx_size(cpi, m->mbmi.tx_size, m->mbmi.sb_type, bc);
 
   if (m->mbmi.sb_type >= BLOCK_8X8) {
     const MB_PREDICTION_MODE A = vp9_above_block_mode(m, above_mi, 0);
@@ -579,7 +566,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                      old_frame_coef_probs[i][j][k][l], &newp, upd);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -617,7 +604,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                      old_frame_coef_probs[i][j][k][l], &newp, upd);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -665,7 +652,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
                   if (t == PIVOT_NODE)
                     s = vp9_prob_diff_update_savings_search_model(
                         frame_branch_ct[i][j][k][l][0],
-                        old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                        old_frame_coef_probs[i][j][k][l], &newp, upd);
                   else
                     s = vp9_prob_diff_update_savings_search(
                         frame_branch_ct[i][j][k][l][t],
@@ -707,7 +694,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
   }
 }
 
-static void update_coef_probs(VP9_COMP* cpi, vp9_writer* w) {
+static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
@@ -830,10 +817,10 @@ static void encode_segmentation(VP9_COMP *cpi,
           const int data_max = vp9_seg_feature_data_max(j);
 
           if (vp9_is_segfeature_signed(j)) {
-            vp9_encode_unsigned_max(wb, abs(data), data_max);
+            encode_unsigned_max(wb, abs(data), data_max);
             vp9_wb_write_bit(wb, data < 0);
           } else {
-            vp9_encode_unsigned_max(wb, data, data_max);
+            encode_unsigned_max(wb, data, data_max);
           }
         }
       }
@@ -842,9 +829,7 @@ static void encode_segmentation(VP9_COMP *cpi,
 }
 
 
-static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
-  VP9_COMMON *const cm = &cpi->common;
-
+static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
   // Mode
   vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
@@ -1007,7 +992,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
-        write_be32(data_ptr + total_size, residual_bc.pos);
+        mem_put_be32(data_ptr + total_size, residual_bc.pos);
         total_size += 4;
       }
 
@@ -1162,7 +1147,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
   else
-    encode_txfm_probs(cpi, &header_bc);
+    encode_txfm_probs(cm, &header_bc);
 
   update_coef_probs(cpi, &header_bc);
 
@@ -1170,7 +1155,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   active_section = 2;
 #endif
 
-  vp9_update_skip_probs(cm, &header_bc);
+  update_skip_probs(cm, &header_bc);
 
   if (!frame_is_intra_only(cm)) {
     int i;
@@ -1185,7 +1170,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
     vp9_zero(cm->counts.inter_mode);
 
     if (cm->interp_filter == SWITCHABLE)
-      update_switchable_interp_probs(cpi, &header_bc);
+      update_switchable_interp_probs(cm, &header_bc);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 94bec8a43..ddfd0ed4f 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -16,7 +16,11 @@
 extern "C" {
 #endif
 
-void vp9_update_skip_probs(VP9_COMMON *cm, vp9_writer *bc);
+struct VP9_COMP;
+
+void vp9_entropy_mode_init();
+
+void vp9_pack_bitstream(struct VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 28942f7d6..b8dc72ab1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -94,7 +94,8 @@ static const uint8_t VP9_VAR_OFFS[64] = {
   128, 128, 128, 128, 128, 128, 128, 128
 };
 
-static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
+static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
+                                              MACROBLOCK *x,
                                               BLOCK_SIZE bs) {
   unsigned int var, sse;
   var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
@@ -102,6 +103,52 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
+static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
+                                                   MACROBLOCK *x,
+                                                   int mi_row,
+                                                   int mi_col,
+                                                   BLOCK_SIZE bs) {
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  int offset = (mi_row * MI_SIZE) * yv12->y_stride + (mi_col * MI_SIZE);
+  unsigned int var, sse;
+  var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
+                           x->plane[0].src.stride,
+                           yv12->y_buffer + offset,
+                           yv12->y_stride,
+                           &sse);
+  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
+                                                   int mi_row,
+                                                   int mi_col) {
+  unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb,
+                                                    mi_row, mi_col,
+                                                    BLOCK_64X64);
+  if (var < 8)
+    return BLOCK_64X64;
+  else if (var < 128)
+    return BLOCK_32X32;
+  else if (var < 2048)
+    return BLOCK_16X16;
+  else
+    return BLOCK_8X8;
+}
+
+static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
+                                                      int mi_row,
+                                                      int mi_col) {
+  unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb,
+                                                    mi_row, mi_col,
+                                                    BLOCK_64X64);
+  if (var < 4)
+    return BLOCK_64X64;
+  else if (var < 10)
+    return BLOCK_32X32;
+  else
+    return BLOCK_16X16;
+}
+
 // Original activity measure from Tim T's code.
 static unsigned int tt_activity_measure(MACROBLOCK *x) {
   unsigned int sse;
@@ -441,7 +488,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 
     if ((cpi->oxcf.aq_mode == VARIANCE_AQ) ||
         (cpi->oxcf.aq_mode == COMPLEXITY_AQ)) {
-    vp9_mb_init_quantizer(cpi, x);
+    vp9_init_plane_quantizers(cpi, x);
   }
 
   // FIXME(rbultje) I'm pretty sure this should go to the end of this block
@@ -492,10 +539,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
-        int_mv best_mv[2];
+        MV best_mv[2];
         for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-        vp9_update_mv_count(cpi, x, best_mv);
+          best_mv[i] = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+        vp9_update_mv_count(cm, xd, best_mv);
       }
 
       if (cm->interp_filter == SWITCHABLE) {
@@ -588,7 +635,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                                                  : cm->last_frame_seg_map;
       mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
-    vp9_mb_init_quantizer(cpi, x);
+    vp9_init_plane_quantizers(cpi, x);
 
     if (seg->enabled && cpi->seg0_cnt > 0 &&
         !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME) &&
@@ -676,7 +723,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
     }
 
     rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
-    vp9_mb_init_quantizer(cpi, x);
+    vp9_init_plane_quantizers(cpi, x);
   }
 
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
@@ -948,7 +995,7 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
   if (rows_left <= 0 || cols_left <= 0) {
     return MIN(bsize, BLOCK_8X8);
   } else {
-    for (; bsize > 0; --bsize) {
+    for (; bsize > 0; bsize -= 3) {
       *bh = num_8x8_blocks_high_lookup[bsize];
       *bw = num_8x8_blocks_wide_lookup[bsize];
       if ((*bh <= rows_left) && (*bw <= cols_left)) {
@@ -965,9 +1012,9 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
 static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
-                             MODE_INFO **mi_8x8, int mi_row, int mi_col) {
+                             MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
   const int mis = cm->mode_info_stride;
   int row8x8_remaining = tile->mi_row_end - mi_row;
   int col8x8_remaining = tile->mi_col_end - mi_col;
@@ -994,7 +1041,7 @@ static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
       for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
         int index = block_row * mis + block_col;
         // Find a partition size that fits
-        bsize = find_partition_size(cpi->sf.always_this_block_size,
+        bsize = find_partition_size(bsize,
                                     (row8x8_remaining - block_row),
                                     (col8x8_remaining - block_col), &bh, &bw);
         mi_8x8[index] = mi_upper_left + index;
@@ -1041,8 +1088,7 @@ static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8) {
   return 0;
 }
 
-static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
-                         BLOCK_SIZE bsize, int output_enabled) {
+static void update_state_rt(VP9_COMP *cpi, const PICK_MODE_CONTEXT *ctx) {
   int i;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -1074,15 +1120,15 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
-        int_mv best_mv[2];
+        MV best_mv[2];
         for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-        vp9_update_mv_count(cpi, x, best_mv);
+          best_mv[i] = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+        vp9_update_mv_count(cm, xd, best_mv);
       }
 
       if (cm->interp_filter == SWITCHABLE) {
-        const int ctx = vp9_get_pred_context_switchable_interp(xd);
-        ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+        const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
+        ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter];
       }
     }
   }
@@ -1100,7 +1146,7 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
       return;
   }
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  update_state_rt(cpi, get_block_context(x, bsize), bsize, output_enabled);
+  update_state_rt(cpi, get_block_context(x, bsize));
 
   encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
   update_stats(cpi);
@@ -1179,7 +1225,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
                    subsize);
       *get_sb_index(x, subsize) = 3;
       encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                subsize);
+                   subsize);
       break;
     default:
       assert("Invalid partition type.");
@@ -1211,13 +1257,14 @@ static void rd_use_partition(VP9_COMP *cpi,
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   int last_part_rate = INT_MAX;
-  int64_t last_part_dist = INT_MAX;
-  int split_rate = INT_MAX;
-  int64_t split_dist = INT_MAX;
+  int64_t last_part_dist = INT64_MAX;
+  int64_t last_part_rd = INT64_MAX;
   int none_rate = INT_MAX;
-  int64_t none_dist = INT_MAX;
+  int64_t none_dist = INT64_MAX;
+  int64_t none_rd = INT64_MAX;
   int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT_MAX;
+  int64_t chosen_dist = INT64_MAX;
+  int64_t chosen_rd = INT64_MAX;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
@@ -1246,7 +1293,8 @@ static void rd_use_partition(VP9_COMP *cpi,
     x->mb_energy = vp9_block_energy(cpi, x, bsize);
   }
 
-  if (cpi->sf.adjust_partitioning_from_last_frame) {
+  if (cpi->sf.partition_search_type == SEARCH_PARTITION &&
+      cpi->sf.adjust_partitioning_from_last_frame) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
       sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
@@ -1272,7 +1320,11 @@ static void rd_use_partition(VP9_COMP *cpi,
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
                                    mi_row, mi_col, bsize);
-      none_rate += x->partition_cost[pl][PARTITION_NONE];
+
+      if (none_rate < INT_MAX) {
+        none_rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist);
+      }
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
       mi_8x8[0]->mbmi.sb_type = bs_type;
@@ -1300,9 +1352,9 @@ static void rd_use_partition(VP9_COMP *cpi,
         *get_sb_index(x, subsize) = 1;
         rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt,
                          subsize, get_block_context(x, subsize), INT64_MAX);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
 
@@ -1324,9 +1376,9 @@ static void rd_use_partition(VP9_COMP *cpi,
         *get_sb_index(x, subsize) = 1;
         rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt,
                          subsize, get_block_context(x, subsize), INT64_MAX);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
         last_part_rate += rt;
@@ -1352,9 +1404,9 @@ static void rd_use_partition(VP9_COMP *cpi,
         rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
                          i != 3);
-        if (rt == INT_MAX || dt == INT_MAX) {
+        if (rt == INT_MAX || dt == INT64_MAX) {
           last_part_rate = INT_MAX;
-          last_part_dist = INT_MAX;
+          last_part_dist = INT64_MAX;
           break;
         }
         last_part_rate += rt;
@@ -1367,16 +1419,19 @@ static void rd_use_partition(VP9_COMP *cpi,
 
   pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                mi_row, mi_col, bsize);
-  if (last_part_rate < INT_MAX)
+  if (last_part_rate < INT_MAX) {
     last_part_rate += x->partition_cost[pl][partition];
+    last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
+  }
 
   if (cpi->sf.adjust_partitioning_from_last_frame
+      && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
       && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
       && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
-    split_rate = 0;
-    split_dist = 0;
+    chosen_rate = 0;
+    chosen_dist = 0;
     restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
     // Split partition.
@@ -1403,46 +1458,44 @@ static void rd_use_partition(VP9_COMP *cpi,
 
       restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-      if (rt == INT_MAX || dt == INT_MAX) {
-        split_rate = INT_MAX;
-        split_dist = INT_MAX;
+      if (rt == INT_MAX || dt == INT64_MAX) {
+        chosen_rate = INT_MAX;
+        chosen_dist = INT64_MAX;
         break;
       }
 
+      chosen_rate += rt;
+      chosen_dist += dt;
+
       if (i != 3)
         encode_sb(cpi, tile, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize);
 
-      split_rate += rt;
-      split_dist += dt;
       pl = partition_plane_context(cpi->above_seg_context,
                                    cpi->left_seg_context,
                                    mi_row + y_idx, mi_col + x_idx,
                                    split_subsize);
-      split_rate += x->partition_cost[pl][PARTITION_NONE];
+      chosen_rate += x->partition_cost[pl][PARTITION_NONE];
     }
     pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
                                  mi_row, mi_col, bsize);
-    if (split_rate < INT_MAX) {
-      split_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-      chosen_rate = split_rate;
-      chosen_dist = split_dist;
+    if (chosen_rate < INT_MAX) {
+      chosen_rate += x->partition_cost[pl][PARTITION_SPLIT];
+      chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
     }
   }
 
   // If last_part is better set the partitioning to that...
-  if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist)
-      < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) {
+  if (last_part_rd < chosen_rd) {
     mi_8x8[0]->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       *(get_sb_partitioning(x, bsize)) = subsize;
     chosen_rate = last_part_rate;
     chosen_dist = last_part_dist;
+    chosen_rd = last_part_rd;
   }
   // If none was better set the partitioning to that...
-  if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)
-      > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) {
+  if (none_rd < chosen_rd) {
     if (bsize >= BLOCK_8X8)
       *(get_sb_partitioning(x, bsize)) = bsize;
     chosen_rate = none_rate;
@@ -1454,7 +1507,7 @@ static void rd_use_partition(VP9_COMP *cpi,
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
   if ( bsize == BLOCK_64X64)
-    assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
+    assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX);
 
   if (do_recon) {
     int output_enabled = (bsize == BLOCK_64X64);
@@ -1912,14 +1965,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   if (bsize == BLOCK_64X64) {
     assert(tp_orig < *tp);
     assert(best_rate < INT_MAX);
-    assert(best_dist < INT_MAX);
+    assert(best_dist < INT64_MAX);
   } else {
     assert(tp_orig == *tp);
   }
 }
 
-static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                          int mi_row, TOKENEXTRA **tp) {
+static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                             int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
   int mi_col;
 
@@ -1935,28 +1988,45 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
 
     BLOCK_SIZE i;
     MACROBLOCK *x = &cpi->mb;
-    for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) {
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[i];
-      const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
-      for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index)
-        for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index)
-          for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index)
-            get_block_context(x, i)->pred_interp_filter = SWITCHABLE;
+
+    if (cpi->sf.adaptive_pred_interp_filter) {
+      for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) {
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+        const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+        for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index)
+          for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index)
+            for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index)
+              get_block_context(x, i)->pred_interp_filter = SWITCHABLE;
+      }
     }
 
     vp9_zero(cpi->mb.pred_mv);
 
-    if (cpi->sf.use_lastframe_partitioning ||
-        cpi->sf.use_one_partition_size_always ) {
+    if ((cpi->sf.partition_search_type == SEARCH_PARTITION &&
+         cpi->sf.use_lastframe_partitioning) ||
+        cpi->sf.partition_search_type == FIXED_PARTITION ||
+        cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION) {
       const int idx_str = cm->mode_info_stride * mi_row + mi_col;
       MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
       MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 
       cpi->mb.source_variance = UINT_MAX;
-      if (cpi->sf.use_one_partition_size_always) {
+      if (cpi->sf.partition_search_type == FIXED_PARTITION) {
+        set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+                         cpi->sf.always_this_block_size);
+        rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+                         &dummy_rate, &dummy_dist, 1);
+      } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION ||
+                 cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+        // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case.
+        // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION
+        // map to the same thing.
+        BLOCK_SIZE bsize;
         set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
-        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+        bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
+        set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
         rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
                          &dummy_rate, &dummy_dist, 1);
       } else {
@@ -2098,9 +2168,10 @@ static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs,
   }
 }
 
-static void reset_skip_txfm_size_b(VP9_COMMON *cm, MODE_INFO **mi_8x8,
-                                   int mis, TX_SIZE max_tx_size, int bw, int bh,
-                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+static void reset_skip_txfm_size_b(const VP9_COMMON *cm, int mis,
+                                   TX_SIZE max_tx_size, int bw, int bh,
+                                   int mi_row, int mi_col,
+                                   MODE_INFO **mi_8x8) {
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
     return;
   } else {
@@ -2130,19 +2201,18 @@ static void reset_skip_txfm_size_sb(VP9_COMMON *cm, MODE_INFO **mi_8x8,
   bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
 
   if (bw == bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mi_8x8, mis, max_tx_size, bs, bs, mi_row,
-                           mi_col, bsize);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, bs, mi_row, mi_col,
+                           mi_8x8);
   } else if (bw == bs && bh < bs) {
-    reset_skip_txfm_size_b(cm, mi_8x8, mis, max_tx_size, bs, hbs, mi_row,
-                           mi_col, bsize);
-    reset_skip_txfm_size_b(cm, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs,
-                           mi_row + hbs, mi_col, bsize);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row, mi_col,
+                           mi_8x8);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row + hbs,
+                           mi_col, mi_8x8 + hbs * mis);
   } else if (bw < bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mi_8x8, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col, bsize);
-    reset_skip_txfm_size_b(cm, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col + hbs, bsize);
-
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row, mi_col,
+                           mi_8x8);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row,
+                           mi_col + hbs, mi_8x8 + hbs);
   } else {
     const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
     int n;
@@ -2225,7 +2295,7 @@ typedef enum {
 } motion_vector_context;
 
 static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                          MB_PREDICTION_MODE mode, int mi_row, int mi_col) {
+                          MB_PREDICTION_MODE mode) {
   mbmi->interp_filter = EIGHTTAP;
   mbmi->mode = mode;
   mbmi->mv[0].as_int = 0;
@@ -2252,47 +2322,43 @@ static INLINE int get_block_col(int b32i, int b16i, int b8i) {
   return ((b32i & 1) << 2) + ((b16i & 1) << 1) + (b8i & 1);
 }
 
-static void rtc_use_partition(VP9_COMP *cpi,
-                              const TileInfo *const tile,
-                              MODE_INFO **mi_8x8,
-                              TOKENEXTRA **tp, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
-                              int do_recon) {
+static void nonrd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
+                                TOKENEXTRA **tp, int mi_row, int mi_col,
+                                BLOCK_SIZE bsize, int *rate, int64_t *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   int mis = cm->mode_info_stride;
   int br, bc;
   int i, j;
-  int chosen_rate = INT_MAX;
-  int64_t chosen_dist = INT_MAX;
   MB_PREDICTION_MODE mode = DC_PRED;
   int rows = MIN(MI_BLOCK_SIZE, tile->mi_row_end - mi_row);
   int cols = MIN(MI_BLOCK_SIZE, tile->mi_col_end - mi_col);
 
-  int mi_8x8_width = num_8x8_blocks_wide_lookup[bsize];
-  int mi_8x8_hight = num_8x8_blocks_high_lookup[bsize];
+  int bw = num_8x8_blocks_wide_lookup[bsize];
+  int bh = num_8x8_blocks_high_lookup[bsize];
 
-  int brate;
-  int64_t bdist;
+  int brate = 0;
+  int64_t bdist = 0;
   *rate = 0;
   *dist = 0;
 
   // find prediction mode for each 8x8 block
-  for (br = 0; br < rows; br += mi_8x8_hight) {
-    for (bc = 0; bc < cols; bc += mi_8x8_width) {
+  for (br = 0; br < rows; br += bh) {
+    for (bc = 0; bc < cols; bc += bw) {
       int row = mi_row + br;
       int col = mi_col + bc;
-      int bh = 0, bw = 0;
+
       BLOCK_SIZE bs = find_partition_size(bsize, rows - br, cols - bc,
                                           &bh, &bw);
 
       set_offsets(cpi, tile, row, col, bs);
 
       if (cm->frame_type != KEY_FRAME)
-        vp9_pick_inter_mode(cpi, x, tile, row, col, &brate, &bdist, bs);
+        vp9_pick_inter_mode(cpi, x, tile, row, col,
+                            &brate, &bdist, bs);
       else
-        set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode, row, col);
+        set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode);
 
       *rate += brate;
       *dist += bdist;
@@ -2302,16 +2368,10 @@ static void rtc_use_partition(VP9_COMP *cpi,
           xd->mi_8x8[j * mis + i] = xd->mi_8x8[0];
     }
   }
-
-  encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
-
-  *rate = chosen_rate;
-  *dist = chosen_dist;
 }
 
-static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
-                              int mi_row, TOKENEXTRA **tp) {
-  VP9_COMMON * const cm = &cpi->common;
+static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+                                int mi_row, TOKENEXTRA **tp) {
   int mi_col;
 
   // Initialize the left context for the new SB row
@@ -2324,12 +2384,27 @@ static void encode_rtc_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
     int dummy_rate;
     int64_t dummy_dist;
 
-    const int idx_str = cm->mode_info_stride * mi_row + mi_col;
-    MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
     cpi->mb.source_variance = UINT_MAX;
 
-    rtc_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_16X16,
-                     &dummy_rate, &dummy_dist, 1);
+    if (cpi->sf.partition_search_type == FIXED_PARTITION) {
+      nonrd_use_partition(cpi, tile, tp, mi_row, mi_col,
+                          cpi->sf.always_this_block_size,
+                          &dummy_rate, &dummy_dist);
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
+    } else if (cpi->sf.partition_search_type == VAR_BASED_FIXED_PARTITION ||
+               cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+      // TODO(debargha): Implement VAR_BASED_PARTITION as a separate case.
+      // Currently both VAR_BASED_FIXED_PARTITION/VAR_BASED_PARTITION
+      // map to the same thing.
+      BLOCK_SIZE bsize = get_nonrd_var_based_fixed_partition(cpi,
+                                                             mi_row,
+                                                             mi_col);
+      nonrd_use_partition(cpi, tile, tp, mi_row, mi_col,
+                          bsize, &dummy_rate, &dummy_dist);
+      encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, BLOCK_64X64);
+    } else {
+      assert(0);
+    }
   }
 }
 // end RTC play code
@@ -2385,7 +2460,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
 
   set_prev_mi(cm);
 
-  if (cpi->sf.use_pick_mode) {
+  if (cpi->sf.use_nonrd_pick_mode) {
     // Initialize internal buffer pointers for rtc coding, where non-RD
     // mode decision is used and hence no buffer pointer swap needed.
     int i;
@@ -2421,10 +2496,10 @@ static void encode_frame_internal(VP9_COMP *cpi) {
           vp9_tile_init(&tile, cm, tile_row, tile_col);
           for (mi_row = tile.mi_row_start;
                mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
-            if (cpi->sf.use_pick_mode)
-              encode_rtc_sb_row(cpi, &tile, mi_row, &tp);
+            if (cpi->sf.use_nonrd_pick_mode)
+              encode_nonrd_sb_row(cpi, &tile, mi_row, &tp);
             else
-              encode_sb_row(cpi, &tile, mi_row, &tp);
+              encode_rd_sb_row(cpi, &tile, mi_row, &tp);
           }
           cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2687,7 +2762,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
 
   x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
                    (cpi->oxcf.aq_mode != COMPLEXITY_AQ) &&
-                   !cpi->sf.use_pick_mode;
+                   !cpi->sf.use_nonrd_pick_mode;
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
@@ -2724,6 +2799,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
       vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
     if (output_enabled)
       sum_intra_stats(&cm->counts, mi);
+    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -2733,19 +2809,17 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
       setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf);
     }
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
-  }
 
-  if (!is_inter_block(mbmi)) {
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-  } else if (!x->skip) {
-    mbmi->skip = 1;
-    vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
-    vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
-  } else {
-    mbmi->skip = 1;
-    if (output_enabled)
-      cm->counts.skip[vp9_get_skip_context(xd)][1]++;
-    reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
+    if (!x->skip) {
+      mbmi->skip = 1;
+      vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
+      vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+    } else {
+      mbmi->skip = 1;
+      if (output_enabled)
+        cm->counts.skip[vp9_get_skip_context(xd)][1]++;
+      reset_skip_context(xd, MAX(bsize, BLOCK_8X8));
+    }
   }
 
   if (output_enabled) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 19421aaac..513730e43 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -51,7 +51,7 @@ void vp9_subtract_block_c(int rows, int cols,
   }
 }
 
-static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
@@ -62,22 +62,6 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                      pd->dst.buf, pd->dst.stride);
 }
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  subtract_plane(x, bsize, 0);
-}
-
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  int i;
-
-  for (i = 1; i < MAX_MB_PLANE; i++)
-    subtract_plane(x, bsize, i);
-}
-
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  vp9_subtract_sby(x, bsize);
-  vp9_subtract_sbuv(x, bsize);
-}
-
 #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
 typedef struct vp9_token_state vp9_token_state;
 
@@ -121,10 +105,9 @@ static int trellis_get_coeff_context(const int16_t *scan,
   return pt;
 }
 
-static void optimize_b(MACROBLOCK *mb,
-                       int plane, int block, BLOCK_SIZE plane_bsize,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       TX_SIZE tx_size) {
+static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+                       TX_SIZE tx_size, MACROBLOCK *mb,
+                       struct optimize_ctx *ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *p = &mb->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
@@ -150,6 +133,11 @@ static void optimize_b(MACROBLOCK *mb,
   const scan_order *so = get_scan(xd, tx_size, type, block);
   const int16_t *scan = so->scan;
   const int16_t *nb = so->neighbors;
+  ENTROPY_CONTEXT *a, *l;
+  int tx_x, tx_y;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &tx_x, &tx_y);
+  a = &ctx->ta[plane][tx_x];
+  l = &ctx->tl[plane][tx_y];
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
@@ -323,29 +311,6 @@ static void optimize_b(MACROBLOCK *mb,
   *a = *l = (final_eob > 0);
 }
 
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
-                    TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
-  int x, y;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  optimize_b(mb, plane, block, plane_bsize,
-             &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
-}
-
-static void optimize_init_b(int plane, BLOCK_SIZE bsize,
-                            struct encode_b_args *args) {
-  const MACROBLOCKD *xd = &args->x->e_mbd;
-  const struct macroblockd_plane* const pd = &xd->plane[plane];
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-  const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
-
-  vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
-                           pd->above_context, pd->left_context,
-                           num_4x4_w, num_4x4_h);
-}
-
 static INLINE void fdct32x32(int rd_transform,
                              const int16_t *src, int16_t *dst, int src_stride) {
   if (rd_transform)
@@ -357,22 +322,21 @@ static INLINE void fdct32x32(int rd_transform,
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const scan_order *scan_order;
-  uint16_t *eob = &p->eobs[block];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   int i, j;
-  int16_t *src_diff;
+  const int16_t *src_diff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   switch (tx_size) {
     case TX_32X32:
-      scan_order = &vp9_default_scan_orders[TX_32X32];
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
       vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
                            p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -380,7 +344,6 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                            scan_order->iscan);
       break;
     case TX_16X16:
-      scan_order = &vp9_default_scan_orders[TX_16X16];
       vp9_fdct16x16(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -388,7 +351,6 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
-      scan_order = &vp9_default_scan_orders[TX_8X8];
       vp9_fdct8x8(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -396,7 +358,6 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      scan_order->scan, scan_order->iscan);
       break;
     case TX_4X4:
-      scan_order = &vp9_default_scan_orders[TX_4X4];
       x->fwd_txm4x4(src_diff, coeff, diff_stride);
       vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
@@ -435,7 +396,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
   if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+    optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
   } else {
     ctx->ta[plane][i] = p->eobs[block] > 0;
     ctx->tl[plane][j] = p->eobs[block] > 0;
@@ -469,8 +430,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
 }
 static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size, void *arg) {
-  struct encode_b_args *const args = arg;
-  MACROBLOCK *const x = args->x;
+  MACROBLOCK *const x = (MACROBLOCK *)arg;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -482,24 +442,14 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
 
   vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
 
-  if (p->eobs[block] == 0)
-    return;
-
-  xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+  if (p->eobs[block] > 0)
+    xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
 }
 
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx ctx;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
-
-  vp9_subtract_sby(x, bsize);
-  if (x->optimize)
-    optimize_init_b(0, bsize, &arg);
-
-  vp9_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1,
-                                         &arg);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  vp9_subtract_plane(x, bsize, 0);
+  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+                                         encode_block_pass1, x);
 }
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -507,17 +457,22 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  int plane;
 
-  if (!x->skip_recode)
-    vp9_subtract_sb(x, bsize);
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    if (!x->skip_recode)
+      vp9_subtract_plane(x, bsize, plane);
 
-  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; ++i)
-      optimize_init_b(i, bsize, &arg);
-  }
+    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+      const struct macroblockd_plane* const pd = &xd->plane[plane];
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
+      vp9_get_entropy_contexts(bsize, tx_size, pd,
+                               ctx.ta[plane], ctx.tl[plane]);
+    }
 
-  vp9_foreach_transformed_block(xd, bsize, encode_block, &arg);
+    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                           &arg);
+  }
 }
 
 static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -548,7 +503,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   src_diff = &p->src_diff[4 * (j * diff_stride + i)];
 
   // if (x->optimize)
-  // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+  //   optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
 
   switch (tx_size) {
     case TX_32X32:
@@ -613,11 +568,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
       scan_order = &vp9_scan_orders[TX_4X4][tx_type];
-      if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
-        mode = xd->mi_8x8[0]->bmi[block].as_mode;
-      else
-        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
-
+      mode = plane == 0 ? get_y_mode(xd->mi_8x8[0], block) : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
                               x->skip_encode ? src : dst,
                               x->skip_encode ? src_stride : dst_stride,
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 515935f9b..dcf6e8759 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -21,14 +21,12 @@ extern "C" {
 #endif
 
 void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
 
 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
 void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
                             BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index af710a8f4..507969951 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -224,35 +224,29 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
   }
 }
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h) {
-  vp9_clear_system_state();
-  vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree);
-  if (mvc_flag_v)
-    build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp);
-  if (mvc_flag_h)
-    build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp);
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* ctx, int usehp) {
+  vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree);
+  build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);
+  build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
-static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound,
+static void inc_mvs(const int_mv mv[2], const MV ref[2], int is_compound,
                     nmv_context_counts *counts) {
   int i;
   for (i = 0; i < 1 + is_compound; ++i) {
-    const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
-                      mv[i].as_mv.col - ref[i].as_mv.col };
+    const MV diff = { mv[i].as_mv.row - ref[i].row,
+                      mv[i].as_mv.col - ref[i].col };
     vp9_inc_mv(&diff, counts);
   }
 }
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
-  MODE_INFO *mi = x->e_mbd.mi_8x8[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                         const MV best_ref_mv[2]) {
+  const MODE_INFO *mi = xd->mi_8x8[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const int is_compound = has_second_ref(mbmi);
-  nmv_context_counts *counts = &cpi->common.counts.mv;
+  nmv_context_counts *counts = &cm->counts.mv;
 
   if (mbmi->sb_type < BLOCK_8X8) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index f0463bbd3..f16b2c17c 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -25,14 +25,11 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w);
 void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
 
-void vp9_build_nmv_cost_table(int *mvjoint,
-                              int *mvcost[2],
-                              const nmv_context* const mvctx,
-                              int usehp,
-                              int mvc_flag_v,
-                              int mvc_flag_h);
-
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]);
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context* mvctx, int usehp);
+
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                         const MV best_ref_mv[2]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index ddb901dd1..8e454e694 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -8,31 +8,34 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <math.h>
 #include <limits.h>
+#include <math.h>
 #include <stdio.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"  // setup_dst_planes()
 #include "vp9/common/vp9_systemdependent.h"
+
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12config.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_entropymv.h"
-#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_vaq.h"
-#include "./vpx_scale_rtcd.h"
-// TODO(jkoleszar): for setup_dst_planes
-#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_variance.h"
 
 #define OUTPUT_FPF 0
 
@@ -133,9 +136,8 @@ static int input_stats(struct twopass_rc *p, FIRSTPASS_STATS *fps) {
   return 1;
 }
 
-static void output_stats(const VP9_COMP *cpi,
-                         struct vpx_codec_pkt_list *pktlist,
-                         FIRSTPASS_STATS *stats) {
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct vpx_codec_pkt_list *pktlist) {
   struct vpx_codec_cx_pkt pkt;
   pkt.kind = VPX_CODEC_STATS_PKT;
   pkt.data.twopass_stats.buf = stats;
@@ -354,7 +356,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) {
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+  output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
 }
 
 static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
@@ -370,7 +372,7 @@ static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
   }
 }
 
-static unsigned int zz_motion_search(const VP9_COMP *cpi, const MACROBLOCK *x) {
+static unsigned int zz_motion_search(const MACROBLOCK *x) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const src = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
@@ -592,7 +594,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
         int_mv mv, tmp_mv;
 
         xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
-        motion_error = zz_motion_search(cpi, x);
+        motion_error = zz_motion_search(x);
         // Assume 0,0 motion with no mv overhead.
         mv.as_int = tmp_mv.as_int = 0;
 
@@ -628,7 +630,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
           int gf_motion_error;
 
           xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-          gf_motion_error = zz_motion_search(cpi, x);
+          gf_motion_error = zz_motion_search(x);
 
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &gf_motion_error);
@@ -675,7 +677,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
           xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
           xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
           vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
-          vp9_encode_sby(x, bsize);
+          vp9_encode_sby_pass1(x, bsize);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -788,7 +790,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
-    output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats);
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
     accumulate_stats(&twopass->total_stats, &fps);
   }
 
@@ -807,7 +809,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
   // Swap frame pointers so last frame refers to the frame we just compressed.
   swap_yv12(lst_yv12, new_yv12);
 
-  vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
+  vp9_extend_frame_borders(lst_yv12);
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
@@ -839,40 +841,6 @@ static double bitcost(double prob) {
   return -(log(prob) / log(2.0));
 }
 
-static int64_t estimate_modemvcost(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *fpstats) {
-#if 0
-  int mv_cost;
-  int mode_cost;
-
-  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
-  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
-  double av_intra = (1.0 - av_pct_inter);
-
-  double zz_cost;
-  double motion_cost;
-  double intra_cost;
-
-  zz_cost = bitcost(av_pct_inter - av_pct_motion);
-  motion_cost = bitcost(av_pct_motion);
-  intra_cost = bitcost(av_intra);
-
-  // Estimate the number of extra bits per mv overhead for mbs. We shift (<< 9)
-  // to match the scaling of number of bits by 512.
-  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
-
-  // Produce a crude estimate of the overhead cost from modes. We shift (<< 9)
-  // to match the scaling of number of bits by 512.
-  mode_cost =
-    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
-           (av_pct_motion * motion_cost) +
-           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
-
-  // TODO(paulwilkins): Fix overhead costs for extended Q range.
-#endif
-  return 0;
-}
-
 static double calc_correction_factor(double err_per_mb,
                                      double err_divisor,
                                      double pt_low,
@@ -1007,9 +975,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   }
 }
 
-void vp9_end_second_pass(VP9_COMP *cpi) {
-}
-
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
 static double get_prediction_decay_rate(const VP9_COMMON *cm,
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 83e337b6d..03c0e20bb 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -89,7 +89,6 @@ void vp9_end_first_pass(struct VP9_COMP *cpi);
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
-void vp9_end_second_pass(struct VP9_COMP *cpi);
 int vp9_twopass_worst_quality(struct VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
                               int section_target_bandwitdh);
 
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 4b642e2b6..a88d5ecf8 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -88,8 +88,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
 #define USE_PARTIAL_COPY 0
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags,
-                       unsigned char *active_map) {
+                       int64_t ts_start, int64_t ts_end, unsigned int flags) {
   struct lookahead_entry *buf;
 #if USE_PARTIAL_COPY
   int row, col, active_end;
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 1c00c462d..ff63c0d0d 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -63,8 +63,7 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
  * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags,
-                       unsigned char *active_map);
+                       int64_t ts_start, int64_t ts_end, unsigned int flags);
 
 
 /**\brief Get the next source buffer to encode
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 44c1f9078..d3e19b4b9 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -132,7 +132,6 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
   return err;
 }
 static int find_best_16x16_intra(VP9_COMP *cpi,
-                                 int mb_y_offset,
                                  MB_PREDICTION_MODE *pbest_mode) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -173,10 +172,7 @@ static void update_mbgraph_mb_stats
   int mb_y_offset,
   YV12_BUFFER_CONFIG *golden_ref,
   int_mv *prev_golden_ref_mv,
-  int gld_y_offset,
   YV12_BUFFER_CONFIG *alt_ref,
-  int_mv *prev_alt_ref_mv,
-  int arf_y_offset,
   int mb_row,
   int mb_col
 ) {
@@ -193,7 +189,7 @@ static void update_mbgraph_mb_stats
   xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
 
   // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, mb_y_offset,
+  intra_error = find_best_16x16_intra(cpi,
                                       &stats->ref[INTRA_FRAME].m.mode);
   if (intra_error <= 0)
     intra_error = 1;
@@ -277,8 +273,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
       MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
 
       update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
-                              golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset,
+                              golden_ref, &gld_left_mv, alt_ref,
                               mb_row, mb_col);
       arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
       gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
@@ -374,10 +369,10 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
       cpi->static_mb_pct = 0;
 
     cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(&cm->seg);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(&cm->seg);
   }
 
   // Free localy allocated storage
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 10dee52ed..7d6fd3b99 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1525,7 +1525,7 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          const MV *center_mv, int block) {
+                          const MV *center_mv, MV *best_mv) {
   int r, c;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const what = x->plane[0].src.buf;
@@ -1544,7 +1544,6 @@ int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
   int best_sad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride,
                              0x7fffffff) +
       mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit);
-  MV *best_mv = &xd->mi_8x8[0]->bmi[block].as_mv[0].as_mv;
   *best_mv = *ref_mv;
 
   for (r = row_min; r < row_max; ++r) {
@@ -1578,13 +1577,12 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          const MV *center_mv, int n) {
+                          const MV *center_mv, MV *best_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const what = x->plane[0].src.buf;
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *const in_what = xd->plane[0].pre[0].buf;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1684,13 +1682,12 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          const MV *center_mv, int n) {
+                          const MV *center_mv, MV *best_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const what = x->plane[0].src.buf;
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *const in_what = xd->plane[0].pre[0].buf;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV *best_mv = &xd->mi_8x8[0]->bmi[n].as_mv[0].as_mv;
   MV this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1830,11 +1827,7 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
   const uint8_t *const in_what = xd->plane[0].pre[0].buf;
   const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride +
                                              ref_mv->col];
-  unsigned int thissad;
-
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  MV this_mv;
-
   const int *mvjsadcost = x->nmvjointsadcost;
   int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
@@ -1846,15 +1839,13 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
     int best_site = -1;
 
     for (j = 0; j < 4; j++) {
-      this_mv.row = ref_mv->row + neighbors[j].row;
-      this_mv.col = ref_mv->col + neighbors[j].col;
-
+      const MV this_mv = {ref_mv->row + neighbors[j].row,
+                          ref_mv->col + neighbors[j].col};
       if (is_mv_in(x, &this_mv)) {
         const uint8_t *check_here = &in_what[this_mv.row * in_what_stride +
                                                 this_mv.col];
-        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
-                              bestsad);
-
+        unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here,
+                                           in_what_stride, bestsad);
         if (thissad < bestsad) {
           thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
                                     mvjsadcost, mvsadcost, error_per_bit);
@@ -1876,15 +1867,15 @@ int vp9_refining_search_sad_c(const MACROBLOCK *x,
     }
   }
 
-  this_mv.row = ref_mv->row * 8;
-  this_mv.col = ref_mv->col * 8;
-
-  if (bestsad < INT_MAX)
+  if (bestsad < INT_MAX) {
+    unsigned int unused;
+    const MV mv = {ref_mv->row * 8, ref_mv->col * 8};
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit);
-  else
+                      &unused) +
+        mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit);
+  } else {
     return INT_MAX;
+  }
 }
 
 int vp9_refining_search_sadx4(const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index ff4b1df75..586a74c9c 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -119,7 +119,7 @@ typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
                                     int distance,
                                     const vp9_variance_fn_ptr_t *fn_ptr,
                                     int *mvjcost, int *mvcost[2],
-                                    const MV *center_mv, int n);
+                                    const MV *center_mv, MV *best_mv);
 
 typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x,
                                         MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index dd170143d..d7bac6f0b 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -14,6 +14,8 @@
 
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
+#include "vpx/internal/vpx_psnr.h"
+#include "vpx_ports/vpx_timer.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_filter.h"
@@ -25,12 +27,12 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_tile_common.h"
 
+#include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/encoder/vp9_psnr.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
@@ -38,9 +40,6 @@
 #include "vp9/encoder/vp9_vaq.h"
 #include "vp9/encoder/vp9_resize.h"
 
-#include "vpx_ports/vpx_timer.h"
-
-void vp9_entropy_mode_init();
 void vp9_coef_tree_initialize();
 
 #define DEFAULT_INTERP_FILTER SWITCHABLE
@@ -264,7 +263,7 @@ static void setup_in_frame_q_adj(VP9_COMP *cpi) {
     // Clear down the complexity map used for rd
     vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
 
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
     // Select delta coding method
@@ -298,7 +297,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     cpi->static_mb_pct = 0;
 
     // Disable segmentation
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(seg);
 
     // Clear down the segment features.
     vp9_clearall_segfeatures(seg);
@@ -311,7 +310,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
     cpi->static_mb_pct = 0;
 
     // Disable segmentation and individual segment features by default
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
     // Scan frames from current to arf frame.
@@ -364,7 +363,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
         // Disable segmentation and clear down features if alt ref
         // is not active for this group
 
-        vp9_disable_segmentation((VP9_PTR)cpi);
+        vp9_disable_segmentation(seg);
 
         vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
@@ -491,18 +490,6 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi) {
   sf->thresh_mult[THR_D207_PRED] += 2500;
   sf->thresh_mult[THR_D63_PRED] += 2500;
 
-  // disable using golden frame modes if golden frames are not being used
-  if (cpi->rc.frames_till_gf_update_due == INT_MAX) {
-    sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;
-  }
-
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -661,6 +648,7 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
+    sf->disable_split_var_thresh = 32;
     sf->disable_filter_search_var_thresh = 100;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
@@ -696,6 +684,7 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->reference_masking = 1;
     sf->auto_mv_step_size = 1;
 
+    sf->disable_split_var_thresh = 64;
     sf->disable_filter_search_var_thresh = 200;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
 
@@ -713,9 +702,9 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
   }
-  if (speed == 5) {
+  if (speed >= 5) {
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
-    sf->use_one_partition_size_always = 1;
+    sf->partition_search_type = FIXED_PARTITION;
     sf->always_this_block_size = BLOCK_16X16;
     sf->tx_size_search_method = frame_is_intra_only(cm) ?
       USE_FULL_RD : USE_LARGESTALL;
@@ -745,6 +734,7 @@ static void set_good_speed_feature(VP9_COMMON *cm,
     sf->mode_skip_start = 6;
   }
 }
+
 static void set_rt_speed_feature(VP9_COMMON *cm,
                                  SPEED_FEATURES *sf,
                                  int speed) {
@@ -850,6 +840,9 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     sf->adaptive_rd_thresh = 5;
     sf->auto_min_max_partition_size = frame_is_intra_only(cm) ?
         RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+    sf->adjust_partitioning_from_last_frame =
+        cm->last_frame_type == KEY_FRAME || (0 ==
+        (cm->current_video_frame + 1) % sf->last_partitioning_redo_frequency);
     sf->subpel_force_stop = 1;
     for (i = 0; i < TX_SIZES; i++) {
       sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
@@ -857,13 +850,21 @@ static void set_rt_speed_feature(VP9_COMMON *cm,
     }
     sf->frame_parameter_update = 0;
     sf->encode_breakout_thresh = 1000;
-
     sf->search_method = FAST_HEX;
   }
   if (speed >= 6) {
-    sf->always_this_block_size = BLOCK_16X16;
-    sf->use_pick_mode = 1;
-    sf->encode_breakout_thresh = 1000;
+    sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
+    sf->search_method = HEX;
+  }
+  if (speed >= 7) {
+    sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
+    sf->use_nonrd_pick_mode = 1;
+    sf->search_method = NSTEP;
+  }
+  if (speed >= 8) {
+    int i;
+    for (i = 0; i < BLOCK_SIZES; ++i)
+      sf->disable_inter_mode_mask[i] = 14;   // only search NEARESTMV (0)
   }
 }
 
@@ -901,7 +902,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->adaptive_motion_search = 0;
   sf->adaptive_pred_interp_filter = 0;
   sf->reference_masking = 0;
-  sf->use_one_partition_size_always = 0;
+  sf->partition_search_type = SEARCH_PARTITION;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
@@ -923,8 +924,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   sf->use_fast_lpf_pick = 0;
   sf->use_fast_coef_updates = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
-  sf->use_pick_mode = 0;
+  sf->use_nonrd_pick_mode = 0;
   sf->encode_breakout_thresh = 0;
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    sf->disable_inter_mode_mask[i] = 0;
 
   switch (cpi->oxcf.mode) {
     case MODE_BESTQUALITY:
@@ -972,6 +975,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
   if (cpi->encode_breakout && cpi->oxcf.mode == MODE_REALTIME &&
       sf->encode_breakout_thresh > cpi->encode_breakout)
     cpi->encode_breakout = sf->encode_breakout_thresh;
+
+  if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
+    sf->adaptive_pred_interp_filter = 0;
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
@@ -1719,7 +1725,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
   CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site),
                                              (MAX_MVSEARCH_STEPS * 8) + 1));
 
-  vp9_create_common(cm);
+  vp9_rtcd();
 
   cpi->use_svc = 0;
 
@@ -2015,10 +2021,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
     return;
 
   if (cpi && (cpi->common.current_video_frame > 0)) {
-    if (cpi->pass == 2) {
-      vp9_end_second_pass(cpi);
-    }
-
 #if CONFIG_INTERNAL_STATS
 
     vp9_clear_system_state();
@@ -2035,11 +2037,11 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
 
       if (cpi->b_calculate_psnr) {
         const double total_psnr =
-            vp9_mse2psnr((double)cpi->total_samples, 255.0,
-                         (double)cpi->total_sq_error);
+            vpx_sse_to_psnr((double)cpi->total_samples, 255.0,
+                            (double)cpi->total_sq_error);
         const double totalp_psnr =
-            vp9_mse2psnr((double)cpi->totalp_samples, 255.0,
-                         (double)cpi->totalp_sq_error);
+            vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0,
+                            (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
                                                 cpi->summed_weights, 8.0);
         const double totalp_ssim = 100 * pow(cpi->summedp_quality /
@@ -2220,7 +2222,7 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
                                           w, h);
     psnr->sse[1 + i] = sse;
     psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vp9_mse2psnr(samples, 255.0, (double) sse);
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse);
 
     total_sse += sse;
     total_samples += samples;
@@ -2228,7 +2230,8 @@ static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
 
   psnr->sse[0] = total_sse;
   psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vp9_mse2psnr((double)total_samples, 255.0, (double)total_sse);
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0,
+                                  (double)total_sse);
 }
 
 static void generate_psnr_packet(VP9_COMP *cpi) {
@@ -2677,8 +2680,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
     vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0);
   }
 
-  vp9_extend_frame_inner_borders(cm->frame_to_show,
-                                 cm->subsampling_x, cm->subsampling_y);
+  vp9_extend_frame_inner_borders(cm->frame_to_show);
 }
 
 static void scale_references(VP9_COMP *cpi) {
@@ -2895,7 +2897,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       vp9_save_coding_context(cpi);
       cpi->dummy_packing = 1;
-      if (!cpi->sf.use_pick_mode)
+      if (!cpi->sf.use_nonrd_pick_mode)
         vp9_pack_bitstream(cpi, dest, size);
 
       rc->projected_frame_size = (int)(*size) << 3;
@@ -3064,6 +3066,9 @@ static void get_ref_frame_flags(VP9_COMP *cpi) {
   if (cpi->gold_is_last)
     cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
 
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX)
+    cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+
   if (cpi->alt_is_last)
     cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
 
@@ -3454,8 +3459,8 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
 
   check_initial_width(cpi, subsampling_x, subsampling_y);
   vpx_usec_timer_start(&timer);
-  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
-                         cpi->active_map_enabled ? cpi->active_map : NULL))
+  if (vp9_lookahead_push(cpi->lookahead,
+                         sd, time_stamp, end_time, frame_flags))
     res = -1;
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -3583,8 +3588,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
         // TODO(agrange) merge these two functions.
         vp9_configure_arnr_filter(cpi, frames_to_arf, cpi->rc.gfu_boost);
         vp9_temporal_filter_prepare(cpi, frames_to_arf);
-        vp9_extend_frame_borders(&cpi->alt_ref_buffer,
-                                 cm->subsampling_x, cm->subsampling_y);
+        vp9_extend_frame_borders(&cpi->alt_ref_buffer);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
 
@@ -3720,7 +3724,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
                                       cm->width, cm->height);
 
     if (vp9_is_scaled(&ref_buf->sf))
-      vp9_extend_frame_borders(buf, cm->subsampling_x, cm->subsampling_y);
+      vp9_extend_frame_borders(buf);
   }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
@@ -3885,15 +3889,15 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
     return -1;
 
   if (!map) {
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(seg);
     return 0;
   }
 
   // Set the segmentation Map
-  vp9_set_segmentation_map((VP9_PTR)cpi, map);
+  vp9_set_segmentation_map(cpi, map);
 
   // Activate segmentation.
-  vp9_enable_segmentation((VP9_PTR)cpi);
+  vp9_enable_segmentation(seg);
 
   // Set up the quant, LF and breakout threshold segment data
   for (i = 0; i < MAX_SEGMENTS; i++) {
@@ -3917,7 +3921,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
 
   // Initialize the feature data structure
   // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
-  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+  vp9_set_segment_data(seg, &feature_data[0][0], SEGMENT_DELTADATA);
 
   return 0;
 }
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 7bcceedb8..019cb1388 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -218,6 +218,22 @@ typedef enum {
   ENCODE_BREAKOUT_LIMITED = 2
 } ENCODE_BREAKOUT_TYPE;
 
+typedef enum {
+  // Search partitions using RD/NONRD criterion
+  SEARCH_PARTITION = 0,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION = 1,
+
+  // Use a fixed size partition in every 64X64 SB, where the size is
+  // determined based on source variance
+  VAR_BASED_FIXED_PARTITION = 2,
+
+  // Use an arbitrary partitioning scheme based on source variance within
+  // a 64X64 SB
+  VAR_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
 typedef struct {
   // Frame level coding parameter update
   int frame_parameter_update;
@@ -304,16 +320,6 @@ typedef struct {
 
   // TODO(JBB): remove this as its no longer used.
 
-  // If set partition size will always be always_this_block_size.
-  int use_one_partition_size_always;
-
-  // Skip rectangular partition test when partition type none gives better
-  // rd than partition type split.
-  int less_rectangular_check;
-
-  // Disable testing non square partitions. (eg 16x32)
-  int use_square_partition_only;
-
   // After looking at the first set of modes (set by index here), skip
   // checking modes for reference frames that don't match the reference frame
   // of the best so far.
@@ -322,9 +328,18 @@ typedef struct {
   // TODO(JBB): Remove this.
   int reference_masking;
 
-  // Used in conjunction with use_one_partition_size_always.
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;
 
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split.
+  int less_rectangular_check;
+
+  // Disable testing non square partitions. (eg 16x32)
+  int use_square_partition_only;
+
   // Sets min and max partition sizes for this 64x64 region based on the
   // same 64x64 in last encoded frame, and the left and above neighbor.
   AUTO_MIN_MAX_MODE auto_min_max_partition_size;
@@ -396,11 +411,15 @@ typedef struct {
   int use_fast_coef_updates;  // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
 
   // This flag controls the use of non-RD mode decision.
-  int use_pick_mode;
+  int use_nonrd_pick_mode;
 
   // This variable sets the encode_breakout threshold. Currently, it is only
   // enabled in real time mode.
   int encode_breakout_thresh;
+
+  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
+  int disable_inter_mode_mask[BLOCK_SIZES];
 } SPEED_FEATURES;
 
 typedef struct {
@@ -699,8 +718,6 @@ static YV12_BUFFER_CONFIG *get_ref_frame_buffer(VP9_COMP *cpi,
 
 void vp9_encode_frame(VP9_COMP *cpi);
 
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
-
 void vp9_set_speed_features(VP9_COMP *cpi);
 
 int vp9_calc_ss_err(const YV12_BUFFER_CONFIG *source,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 383d92751..edb14ea42 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -8,39 +8,28 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
 
-#include "vp9/common/vp9_pragmas.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/common/vp9_entropymode.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_rdopt.h"
+
+#include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_rdopt.h"
 
 static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                     const TileInfo *const tile,
-                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                     int_mv *tmp_mv, int *rate_mv) {
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
@@ -98,8 +87,28 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1,
-                         &cpi->fn_ptr[bsize], &ref_mv.as_mv, &tmp_mv->as_mv);
+  if (cpi->sf.search_method == FAST_HEX) {
+    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb,
+                                  &cpi->fn_ptr[bsize], 1,
+                                  &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == HEX) {
+    bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
+                             &cpi->fn_ptr[bsize], 1,
+                             &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == SQUARE) {
+    bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
+                                &cpi->fn_ptr[bsize], 1,
+                                &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == BIGDIA) {
+    bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
+                                &cpi->fn_ptr[bsize], 1,
+                                &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else {
+    bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                     sadpb, further_steps, 1,
+                                     &cpi->fn_ptr[bsize],
+                                     &ref_mv.as_mv, &tmp_mv->as_mv);
+  }
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
   x->mv_row_min = tmp_row_min;
@@ -133,12 +142,12 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     const TileInfo *const tile,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    int_mv *tmp_mv) {
+                                    MV *tmp_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int ref = mbmi->ref_frame[0];
-  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
   int dis;
 
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
@@ -154,10 +163,10 @@ static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
-  tmp_mv->as_mv.col >>= 3;
-  tmp_mv->as_mv.row >>= 3;
+  tmp_mv->col >>= 3;
+  tmp_mv->row >>= 3;
 
-  cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+  cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
                                cpi->common.allow_high_precision_mv,
                                x->errorperbit,
                                &cpi->fn_ptr[bsize],
@@ -183,16 +192,18 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  MB_PREDICTION_MODE this_mode;
-  MV_REFERENCE_FRAME ref_frame;
+  MB_PREDICTION_MODE this_mode, best_mode = ZEROMV;
+  MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-  static const int cost[4]= { 0, 50, 75, 100 };
+  int64_t this_rd = INT64_MAX;
+  static const int cost[4]= { 0, 2, 4, 6 };
 
   const int64_t inter_mode_thresh = 300;
   const int64_t intra_mode_cost = 50;
@@ -230,7 +241,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     int rate_mv = 0;
-
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
 
@@ -240,38 +250,64 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
     clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
 
+    mbmi->ref_frame[0] = ref_frame;
+
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-      int rate = cost[INTER_OFFSET(this_mode)];
+      int rate = cost[INTER_OFFSET(this_mode)]
+          << (num_pels_log2_lookup[bsize] - 4);
       int64_t dist;
+      if (cpi->sf.disable_inter_mode_mask[bsize] &
+          (1 << INTER_OFFSET(this_mode)))
+        continue;
 
       if (this_mode == NEWMV) {
+        if (this_rd < (1 << num_pels_log2_lookup[bsize]))
+          continue;
+
         x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
             full_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
                                      &frame_mv[NEWMV][ref_frame], &rate_mv);
 
         if (frame_mv[NEWMV][ref_frame].as_int == INVALID_MV)
           continue;
+
+        sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+                                &frame_mv[NEWMV][ref_frame].as_mv);
+      }
+
+      if (frame_mv[this_mode][ref_frame].as_int == 0) {
+        dist = x->mode_sad[ref_frame][INTER_OFFSET(ZEROMV)];
+      } else if (this_mode != NEARESTMV &&
+                 frame_mv[NEARESTMV][ref_frame].as_int ==
+                     frame_mv[this_mode][ref_frame].as_int) {
+        dist = x->mode_sad[ref_frame][INTER_OFFSET(NEARESTMV)];
+      } else {
+        mbmi->mode = this_mode;
+        mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+        vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)] =
+            cpi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+                                   pd->dst.buf, pd->dst.stride, INT_MAX);
       }
 
-      dist = x->mode_sad[ref_frame][INTER_OFFSET(this_mode)];
       this_rd = rate + dist;
 
       if (this_rd < best_rd) {
         best_rd = this_rd;
-        mbmi->mode = this_mode;
-        mbmi->ref_frame[0] = ref_frame;
-        mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
-        xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
-        mbmi->uv_mode = this_mode;
+        best_mode = this_mode;
+        best_ref_frame = ref_frame;
       }
     }
   }
 
+  mbmi->mode = best_mode;
+  mbmi->ref_frame[0] = best_ref_frame;
+  mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+  xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
   if (best_rd > inter_mode_thresh) {
-    struct macroblock_plane *const p = &x->plane[0];
-    struct macroblockd_plane *const pd = &xd->plane[0];
     for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
       vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
                               mbmi->tx_size, this_mode,
@@ -288,18 +324,9 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         mbmi->mode = this_mode;
         mbmi->ref_frame[0] = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
+        mbmi->mv[0].as_int = INVALID_MV;
       }
     }
   }
-
-  // Perform sub-pixel motion search, if NEWMV is chosen
-  if (mbmi->mode == NEWMV) {
-    ref_frame = mbmi->ref_frame[0];
-    sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
-                            &frame_mv[NEWMV][ref_frame]);
-    mbmi->mv[0].as_int = frame_mv[NEWMV][ref_frame].as_int;
-    xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
-  }
-
   return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_psnr.c b/vp9/encoder/vp9_psnr.c
deleted file mode 100644
index 58294e15a..000000000
--- a/vp9/encoder/vp9_psnr.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-
-#include "vpx_scale/yv12config.h"
-
-#define MAX_PSNR 100
-
-double vp9_mse2psnr(double samples, double peak, double mse) {
-  double psnr;
-
-  if (mse > 0.0)
-    psnr = 10.0 * log10(peak * peak * samples / mse);
-  else
-    psnr = MAX_PSNR;  // Limit to prevent / 0
-
-  if (psnr > MAX_PSNR)
-    psnr = MAX_PSNR;
-
-  return psnr;
-}
diff --git a/vp9/encoder/vp9_psnr.h b/vp9/encoder/vp9_psnr.h
deleted file mode 100644
index ffe00ed2c..000000000
--- a/vp9/encoder/vp9_psnr.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_PSNR_H_
-#define VP9_ENCODER_VP9_PSNR_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-double vp9_mse2psnr(double samples, double peak, double mse);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_PSNR_H_
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 372c36221..4ab8995e3 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -9,15 +9,16 @@
  */
 
 #include <math.h>
+
 #include "vpx_mem/vpx_mem.h"
 
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_quant_common.h"
-
 #include "vp9/common/vp9_seg_common.h"
 
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -151,44 +152,40 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
 }
 
 void vp9_init_quantizer(VP9_COMP *cpi) {
-  int i, q;
   VP9_COMMON *const cm = &cpi->common;
+  int i, q, quant;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
     const int qrounding_factor = q == 0 ? 64 : 48;
 
-    // y
     for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
-                               : vp9_ac_quant(q, 0);
+      // y
+      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
+                     : vp9_ac_quant(q, 0);
       invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant);
       cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       cpi->y_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->y_dequant[q][i] = quant;
-    }
 
-    // uv
-    for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
-                               : vp9_ac_quant(q, cm->uv_ac_delta_q);
+      // uv
+      quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q)
+                     : vp9_ac_quant(q, cm->uv_ac_delta_q);
       invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant);
       cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->uv_dequant[q][i] = quant;
-    }
 
 #if CONFIG_ALPHA
-    // alpha
-    for (i = 0; i < 2; ++i) {
-      const int quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
-                               : vp9_ac_quant(q, cm->a_ac_delta_q);
+      // alpha
+      quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q)
+                     : vp9_ac_quant(q, cm->a_ac_delta_q);
       invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant);
       cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
       cpi->a_round[q][i] = (qrounding_factor * quant) >> 7;
       cm->a_dequant[q][i] = quant;
-    }
 #endif
+    }
 
     for (i = 2; i < 8; i++) {
       cpi->y_quant[q][i] = cpi->y_quant[q][1];
@@ -214,7 +211,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
   }
 }
 
-void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
+void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
   const VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
@@ -246,7 +243,7 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
   x->plane[3].zbin = cpi->a_zbin[qindex];
   x->plane[3].round = cpi->a_round[qindex];
-  x->plane[3].zbin_extra = (int16_t)zbin_extra;
+  x->plane[3].zbin_extra = (int16_t)((cm->a_dequant[qindex][1] * zbin) >> 7);
   xd->plane[3].dequant = cm->a_dequant[qindex];
 #endif
 
@@ -272,26 +269,17 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
-  // Clear Zbin mode boost for default case
   cpi->zbin_mode_boost = 0;
-
-  // MB level quantizer setup
-  vp9_mb_init_quantizer(cpi, &cpi->mb);
+  vp9_init_plane_quantizers(cpi, &cpi->mb);
 }
 
 void vp9_set_quantizer(struct VP9_COMP *cpi, int q) {
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
 
+  // quantizer has to be reinitialized with vp9_init_quantizer() if any
+  // delta_q changes.
   cm->base_qindex = q;
-
-  // if any of the delta_q values are changing update flag will
-  // have to be set.
   cm->y_dc_delta_q = 0;
   cm->uv_dc_delta_q = 0;
   cm->uv_ac_delta_q = 0;
-
-  // quantizer has to be reinitialized if any delta_q changes.
-  // As there are not any here for now this is inactive code.
-  // if(update)
-  //    vp9_init_quantizer(cpi);
 }
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 680cf4aec..f356b125c 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -28,7 +28,7 @@ void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
 
 void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
 
-void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x);
+void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
 
 void vp9_init_quantizer(struct VP9_COMP *cpi);
 
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index f78ebfe18..89aa82140 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -8,23 +8,24 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 
 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
@@ -958,17 +959,10 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
   }
 
   // Clip the active best and worst quality values to limits.
-  if (active_worst_quality > rc->worst_quality)
-    active_worst_quality = rc->worst_quality;
-
-  if (active_best_quality < rc->best_quality)
-    active_best_quality = rc->best_quality;
-
-  if (active_best_quality > rc->worst_quality)
-    active_best_quality = rc->worst_quality;
-
-  if (active_worst_quality < active_best_quality)
-    active_worst_quality = active_best_quality;
+  active_best_quality = clamp(active_best_quality,
+                              rc->best_quality, rc->worst_quality);
+  active_worst_quality = clamp(active_worst_quality,
+                               active_best_quality, rc->worst_quality);
 
   *top_index = active_worst_quality;
   *bottom_index = active_best_quality;
@@ -1041,7 +1035,7 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi,
   // JBB : This is realtime mode.  In real time mode the first frame
   // should be larger. Q of 0 is disabled because we force tx size to be
   // 16x16...
-  if (cpi->sf.use_pick_mode) {
+  if (cpi->sf.use_nonrd_pick_mode) {
     if (cpi->common.current_video_frame == 0)
       q /= 3;
     if (q == 0)
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5ff2c84ae..3effbcab3 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -8,34 +8,38 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
 
-#include "vp9/common/vp9_pragmas.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_variance.h"
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -272,18 +276,12 @@ static void set_block_thresholds(VP9_COMP *cpi) {
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCK *x = &cpi->mb;
-  int qindex, i;
+  int i;
 
   vp9_clear_system_state();
 
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
-
   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
-  cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
+  cpi->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
   x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
   x->errorperbit += (x->errorperbit == 0);
@@ -295,21 +293,22 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
 
   set_block_thresholds(cpi);
 
-  fill_token_costs(x->token_costs, cm->fc.coef_probs);
+  if (!cpi->sf.use_nonrd_pick_mode) {
+    fill_token_costs(x->token_costs, cm->fc.coef_probs);
 
-  if (!cpi->sf.use_pick_mode) {
     for (i = 0; i < PARTITION_CONTEXTS; i++)
       vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
                       vp9_partition_tree);
+  }
 
+  if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1) {
     fill_mode_costs(cpi);
 
     if (!frame_is_intra_only(cm)) {
       vp9_build_nmv_cost_table(x->nmvjointcost,
                                cm->allow_high_precision_mv ? x->nmvcost_hp
                                                            : x->nmvcost,
-                               &cm->fc.nmvc,
-                               cm->allow_high_precision_mv, 1, 1);
+                               &cm->fc.nmvc, cm->allow_high_precision_mv);
 
       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
         vp9_cost_tokens((int *)x->inter_mode_cost[i],
@@ -524,15 +523,15 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum = dist_sum << 4;
 }
 
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
   int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += (unsigned)this_diff * this_diff;
-    sqcoeff += (unsigned) coeff[i] * coeff[i];
+    const int diff = coeff[i] - dqcoeff[i];
+    error +=  diff * diff;
+    sqcoeff += coeff[i] * coeff[i];
   }
 
   *ssz = sqcoeff;
@@ -563,18 +562,16 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
-  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
-  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
+  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-                   x->token_costs[tx_size][type][ref];
-  const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
+                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
   uint8_t *p_tok = x->token_cache;
-  int pt = combine_entropy_contexts(above_ec, left_ec);
+  int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
 
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
-                                      : get_uv_tx_size(mbmi) == tx_size);
+                              : get_uv_tx_size(mbmi) == tx_size);
 
   if (eob == 0) {
     // single eob token
@@ -584,7 +581,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
     int band_left = *band_count++;
 
     // dc token
-    int v = qcoeff_ptr[0];
+    int v = qcoeff[0];
     int prev_t = vp9_dct_value_tokens_ptr[v].token;
     cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
     p_tok[0] = vp9_pt_energy_class[prev_t];
@@ -595,7 +592,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
       const int rc = scan[c];
       int t;
 
-      v = qcoeff_ptr[rc];
+      v = qcoeff[rc];
       t = vp9_dct_value_tokens_ptr[v].token;
       pt = get_coef_context(nb, p_tok, c);
       cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
@@ -692,10 +689,16 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
   }
 }
 
-void vp9_get_entropy_contexts(TX_SIZE tx_size,
-    ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
-    const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
-    int num_4x4_w, int num_4x4_h) {
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+  const ENTROPY_CONTEXT *const above = pd->above_context;
+  const ENTROPY_CONTEXT *const left = pd->left_context;
+
   int i;
   switch (tx_size) {
     case TX_4X4:
@@ -732,9 +735,6 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
                              BLOCK_SIZE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-  const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
-  const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
   struct rdcost_block_args args = { 0 };
   args.x = x;
   args.best_rd = ref_best_rd;
@@ -742,9 +742,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
   if (plane == 0)
     xd->mi_8x8[0]->mbmi.tx_size = tx_size;
 
-  vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
-                           pd->above_context, pd->left_context,
-                           num_4x4_w, num_4x4_h);
+  vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
 
@@ -935,27 +933,23 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
   }
 }
 
-static void super_block_yrd(VP9_COMP *cpi,
-                            MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, int64_t *psse, BLOCK_SIZE bs,
-                            int64_t txfm_cache[TX_MODES],
-                            int64_t ref_best_rd) {
+static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                  int64_t *distortion, int *skip,
+                                  int64_t *psse, BLOCK_SIZE bs,
+                                  int64_t txfm_cache[TX_MODES],
+                                  int64_t ref_best_rd) {
   int r[TX_SIZES][2], s[TX_SIZES];
   int64_t d[TX_SIZES], sse[TX_SIZES];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-  const int b_inter_mode = is_inter_block(mbmi);
   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   TX_SIZE tx_size;
 
-
   assert(bs == mbmi->sb_type);
-  if (b_inter_mode)
-    vp9_subtract_sby(x, bs);
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
-      (cpi->sf.tx_size_search_method != USE_FULL_RD &&
-       !b_inter_mode)) {
+  vp9_subtract_plane(x, bs, 0);
+
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
                              ref_best_rd, bs);
@@ -964,8 +958,7 @@ static void super_block_yrd(VP9_COMP *cpi,
     return;
   }
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
-      b_inter_mode) {
+  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
       model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
                            &r[tx_size][0], &d[tx_size], &s[tx_size]);
@@ -983,6 +976,36 @@ static void super_block_yrd(VP9_COMP *cpi,
     *psse = sse[mbmi->tx_size];
 }
 
+static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                  int64_t *distortion, int *skip,
+                                  int64_t *psse, BLOCK_SIZE bs,
+                                  int64_t txfm_cache[TX_MODES],
+                                  int64_t ref_best_rd) {
+  int64_t sse[TX_SIZES];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+  assert(bs == mbmi->sb_type);
+  if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
+    vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
+    choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
+                             ref_best_rd, bs);
+  } else {
+    int r[TX_SIZES][2], s[TX_SIZES];
+    int64_t d[TX_SIZES];
+    TX_SIZE tx_size;
+    for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
+      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
+                       &s[tx_size], &sse[tx_size],
+                       ref_best_rd, 0, bs, tx_size);
+    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                             skip, txfm_cache, bs);
+  }
+  if (psse)
+    *psse = sse[mbmi->tx_size];
+}
+
+
 static int conditional_skipintra(MB_PREDICTION_MODE mode,
                                  MB_PREDICTION_MODE best_intra_mode) {
   if (mode == D117_PRED &&
@@ -1012,10 +1035,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      int64_t *bestdistortion,
                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-  int rate = 0;
-  int64_t distortion;
+
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
@@ -1024,8 +1046,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                                             src_stride)];
   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
                                                        dst_stride)];
-  int16_t *src_diff, *coeff;
-
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
 
@@ -1043,6 +1063,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
+    int64_t distortion = 0;
+    int rate = bmode_costs[mode];
 
     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
       continue;
@@ -1054,55 +1076,50 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           continue;
     }
 
-    rate = bmode_costs[mode];
-    distortion = 0;
-
     vpx_memcpy(tempa, ta, sizeof(ta));
     vpx_memcpy(templ, tl, sizeof(tl));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        int64_t ssz;
-        const scan_order *so;
-        const uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
-        uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
         const int block = ib + idy * 2 + idx;
-        TX_TYPE tx_type;
+        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+                                                            p->src_diff);
+        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi_8x8[0]->bmi[block].as_mode = mode;
-        src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         vp9_predict_intra_block(xd, block, 1,
                                 TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, idx, idy, 0);
-        vp9_subtract_block(4, 4, src_diff, 8,
-                           src, src_stride,
-                           dst, dst_stride);
-
-        tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-        so = &vp9_scan_orders[TX_4X4][tx_type];
-
-        if (tx_type != DCT_DCT)
+        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+
+        if (xd->lossless) {
+          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          vp9_fwht4x4(src_diff, coeff, 8);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors);
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
+                          p->eobs[block]);
+        } else {
+          int64_t unused;
+          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
           vp9_fht4x4(src_diff, coeff, 8, tx_type);
-        else
-          x->fwd_txm4x4(src_diff, coeff, 8);
-
-        vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-
-        ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                             so->scan, so->neighbors);
-        distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
-                                      16, &ssz) >> 2;
-        if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
-          goto next;
-
-        if (tx_type != DCT_DCT)
-          vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
-                               dst, pd->dst.stride, tx_type);
-        else
-          xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
-                       16);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors);
+          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                                        16, &unused) >> 2;
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                         dst, dst_stride, p->eobs[block]);
+        }
       }
     }
 
@@ -1243,8 +1260,8 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
     mic->mbmi.mode = mode;
 
-    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
-                    bsize, local_tx_cache, best_rd);
+    intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+        &s, NULL, bsize, local_tx_cache, best_rd);
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1279,7 +1296,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   return best_rd;
 }
 
-static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
+static void super_block_uvrd(MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
                              int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
@@ -1293,8 +1310,11 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
   if (ref_best_rd < 0)
     goto term;
 
-  if (is_inter_block(mbmi))
-    vp9_subtract_sbuv(x, bsize);
+  if (is_inter_block(mbmi)) {
+    int plane;
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp9_subtract_plane(x, bsize, plane);
+  }
 
   *rate = 0;
   *distortion = 0;
@@ -1326,6 +1346,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE mode_selected = DC_PRED;
   int64_t best_rd = INT64_MAX, this_rd;
@@ -1336,9 +1357,9 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
-    x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
+    xd->mi_8x8[0]->mbmi.uv_mode = mode;
 
-    super_block_uvrd(cpi, x, &this_rate_tokenonly,
+    super_block_uvrd(x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1356,7 +1377,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
       if (!x->select_txfm_size) {
         int i;
         struct macroblock_plane *const p = x->plane;
-        struct macroblockd_plane *const pd = x->e_mbd.plane;
+        struct macroblockd_plane *const pd = xd->plane;
         for (i = 1; i < MAX_MB_PLANE; ++i) {
           p[i].coeff    = ctx->coeff_pbuf[i][2];
           p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
@@ -1377,25 +1398,21 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
+  xd->mi_8x8[0]->mbmi.uv_mode = mode_selected;
   return best_rd;
 }
 
-static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
+static int64_t rd_sbuv_dcpred(const VP9_COMMON *cm, MACROBLOCK *x,
                               int *rate, int *rate_tokenonly,
                               int64_t *distortion, int *skippable,
                               BLOCK_SIZE bsize) {
-  int64_t this_rd;
-  int64_t this_sse;
+  int64_t unused;
 
   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
-  super_block_uvrd(cpi, x, rate_tokenonly, distortion,
-                   skippable, &this_sse, bsize, INT64_MAX);
-  *rate = *rate_tokenonly +
-          x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
-  this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-
-  return this_rd;
+  super_block_uvrd(x, rate_tokenonly, distortion,
+                   skippable, &unused, bsize, INT64_MAX);
+  *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
@@ -1408,8 +1425,8 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   if (cpi->sf.use_uv_intra_rd_estimate) {
-    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
-                   bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+    rd_sbuv_dcpred(&cpi->common, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                   skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   // Else do a proper rd search for each possible transform size that may
   // be considered in the main rd loop.
   } else {
@@ -1423,8 +1440,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
                        int mode_context) {
   MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+  const int segment_id = x->e_mbd.mi_8x8[0]->mbmi.segment_id;
 
   // Don't account for mode here if segment skip is enabled.
   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
@@ -1449,7 +1465,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                 int *rate_mv);
 
 static int labels2mode(MACROBLOCK *x, int i,
-                       MB_PREDICTION_MODE this_mode,
+                       MB_PREDICTION_MODE mode,
                        int_mv *this_mv, int_mv *this_second_mv,
                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
                        int_mv seg_mvs[MAX_REF_FRAMES],
@@ -1459,23 +1475,18 @@ static int labels2mode(MACROBLOCK *x, int i,
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mi_8x8[0];
   MB_MODE_INFO *mbmi = &mic->mbmi;
-  int cost = 0, thismvcost = 0;
+  int thismvcost = 0;
   int idx, idy;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   const int has_second_rf = has_second_ref(mbmi);
 
-  /* We have to be careful retrieving previously-encoded motion vectors.
-   Ones from this macroblock have to be pulled from the BLOCKD array
-   as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  MB_PREDICTION_MODE m;
-
   // the only time we should do costing for new motion vector or mode
   // is when we are on a new label  (jbb May 08, 2007)
-  switch (m = this_mode) {
+  switch (mode) {
     case NEWMV:
       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
-      thismvcost  = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
+      thismvcost += vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       if (has_second_rf) {
         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
@@ -1487,14 +1498,12 @@ static int labels2mode(MACROBLOCK *x, int i,
     case NEARESTMV:
       this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
       if (has_second_rf)
-        this_second_mv->as_int =
-            frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
+        this_second_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
       break;
     case NEARMV:
       this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
       if (has_second_rf)
-        this_second_mv->as_int =
-            frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
+        this_second_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
       break;
     case ZEROMV:
       this_mv->as_int = 0;
@@ -1505,22 +1514,19 @@ static int labels2mode(MACROBLOCK *x, int i,
       break;
   }
 
-  cost = cost_mv_ref(cpi, this_mode,
-                     mbmi->mode_context[mbmi->ref_frame[0]]);
-
   mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
   if (has_second_rf)
     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-  mic->bmi[i].as_mode = m;
+  mic->bmi[i].as_mode = mode;
 
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
                  &mic->bmi[i], sizeof(mic->bmi[i]));
 
-  cost += thismvcost;
-  return cost;
+  return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
+            thismvcost;
 }
 
 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
@@ -1611,7 +1617,7 @@ typedef struct {
 } SEG_RDSTAT;
 
 typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
+  int_mv *ref_mv[2];
   int_mv mvp;
 
   int64_t segment_rd;
@@ -1722,6 +1728,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
         mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        if (cpi->sf.disable_inter_mode_mask[bsize] & (1 << mode_idx))
+          continue;
 
         // if we're near/nearest and mv == 0,0, compare to zeromv
         if ((this_mode == NEARMV || this_mode == NEARESTMV ||
@@ -1819,52 +1827,52 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
           // adjust src pointer for this block
           mi_buf_shift(x, i);
 
-          vp9_set_mv_search_range(x, &bsi->ref_mv->as_mv);
+          vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
           if (cpi->sf.search_method == HEX) {
             bestsme = vp9_hex_search(x, &mvp_full,
                                      step_param,
                                      sadpb, 1, v_fn_ptr, 1,
-                                     &bsi->ref_mv->as_mv,
+                                     &bsi->ref_mv[0]->as_mv,
                                      &new_mv->as_mv);
           } else if (cpi->sf.search_method == SQUARE) {
             bestsme = vp9_square_search(x, &mvp_full,
                                         step_param,
                                         sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv->as_mv,
+                                        &bsi->ref_mv[0]->as_mv,
                                         &new_mv->as_mv);
           } else if (cpi->sf.search_method == BIGDIA) {
             bestsme = vp9_bigdia_search(x, &mvp_full,
                                         step_param,
                                         sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv->as_mv,
+                                        &bsi->ref_mv[0]->as_mv,
                                         &new_mv->as_mv);
           } else {
             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                              sadpb, further_steps, 0, v_fn_ptr,
-                                             &bsi->ref_mv->as_mv,
+                                             &bsi->ref_mv[0]->as_mv,
                                              &new_mv->as_mv);
           }
 
           // Should we do a full search (best quality only)
           if (cpi->oxcf.mode == MODE_BESTQUALITY ||
               cpi->oxcf.mode == MODE_SECONDPASS_BEST) {
+            int_mv *const best_mv = &mi->bmi[i].as_mv[0];
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
-
             thissme = cpi->full_search_sad(x, &mvp_full,
                                            sadpb, 16, v_fn_ptr,
                                            x->nmvjointcost, x->mvcost,
-                                           &bsi->ref_mv->as_mv, i);
-
+                                           &bsi->ref_mv[0]->as_mv,
+                                           &best_mv->as_mv);
             if (thissme < bestsme) {
               bestsme = thissme;
-              new_mv->as_int = mi->bmi[i].as_mv[0].as_int;
+              new_mv->as_int = best_mv->as_int;
             } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              mi->bmi[i].as_mv[0].as_int = new_mv->as_int;
+              // The full search result is actually worse so re-instate the
+              // previous best vector
+              best_mv->as_int = new_mv->as_int;
             }
           }
 
@@ -1872,7 +1880,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
             int distortion;
             cpi->find_fractional_mv_step(x,
                                          &new_mv->as_mv,
-                                         &bsi->ref_mv->as_mv,
+                                         &bsi->ref_mv[0]->as_mv,
                                          cm->allow_high_precision_mv,
                                          x->errorperbit, v_fn_ptr,
                                          cpi->sf.subpel_force_stop,
@@ -1919,7 +1927,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
         bsi->rdstat[i][mode_idx].brate =
             labels2mode(x, i, this_mode, &mode_mv[this_mode],
                         &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
-                        bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                        bsi->ref_mv[0], bsi->ref_mv[1], x->nmvjointcost,
                         x->mvcost, cpi);
 
 
@@ -2028,7 +2036,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
 
       labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
-                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                  bsi->ref_mv[0], bsi->ref_mv[1], x->nmvjointcost,
                   x->mvcost, cpi);
 
       br += bsi->rdstat[i][mode_idx].brate;
@@ -2083,8 +2091,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   vp9_zero(*bsi);
 
   bsi->segment_rd = best_rd;
-  bsi->ref_mv = best_ref_mv;
-  bsi->second_ref_mv = second_best_ref_mv;
+  bsi->ref_mv[0] = best_ref_mv;
+  bsi->ref_mv[1] = second_best_ref_mv;
   bsi->mvp.as_int = best_ref_mv->as_int;
   bsi->mvthresh = mvthresh;
 
@@ -2369,7 +2377,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   int sadpb = x->sadperbit16;
   MV mvp_full;
   int ref = mbmi->ref_frame[0];
-  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
@@ -2379,10 +2387,10 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
                                                                         ref);
 
-  int_mv pred_mv[3];
-  pred_mv[0] = mbmi->ref_mvs[ref][0];
-  pred_mv[1] = mbmi->ref_mvs[ref][1];
-  pred_mv[2] = x->pred_mv[ref];
+  MV pred_mv[3];
+  pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref].as_mv;
 
   if (scaled_ref_frame) {
     int i;
@@ -2395,7 +2403,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
-  vp9_set_mv_search_range(x, &ref_mv.as_mv);
+  vp9_set_mv_search_range(x, &ref_mv);
 
   // Work out the size of the first step in the mv step search.
   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
@@ -2440,7 +2448,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
     }
   }
 
-  mvp_full = pred_mv[x->mv_best_ref_index[ref]].as_mv;
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
 
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
@@ -2451,24 +2459,24 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   if (cpi->sf.search_method == FAST_HEX) {
     bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb,
                                   &cpi->fn_ptr[bsize], 1,
-                                  &ref_mv.as_mv, &tmp_mv->as_mv);
+                                  &ref_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == HEX) {
     bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
                              &cpi->fn_ptr[bsize], 1,
-                             &ref_mv.as_mv, &tmp_mv->as_mv);
+                             &ref_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == SQUARE) {
     bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
                                 &cpi->fn_ptr[bsize], 1,
-                                &ref_mv.as_mv, &tmp_mv->as_mv);
+                                &ref_mv, &tmp_mv->as_mv);
   } else if (cpi->sf.search_method == BIGDIA) {
     bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
                                 &cpi->fn_ptr[bsize], 1,
-                                &ref_mv.as_mv, &tmp_mv->as_mv);
+                                &ref_mv, &tmp_mv->as_mv);
   } else {
     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                      sadpb, further_steps, 1,
                                      &cpi->fn_ptr[bsize],
-                                     &ref_mv.as_mv, &tmp_mv->as_mv);
+                                     &ref_mv, &tmp_mv->as_mv);
   }
 
   x->mv_col_min = tmp_col_min;
@@ -2478,7 +2486,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
                                  cm->allow_high_precision_mv,
                                  x->errorperbit,
                                  &cpi->fn_ptr[bsize],
@@ -2487,7 +2495,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref]);
   }
-  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
+  *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
@@ -2728,7 +2736,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       frame_mv[refs[0]].as_int == 0 &&
       !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
       (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
-    int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
+    int rfc = mbmi->mode_context[refs[0]];
     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
@@ -2743,17 +2751,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       assert(this_mode == ZEROMV);
       if (num_refs == 1) {
         if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
+             mode_mv[NEARESTMV][refs[0]].as_int == 0) ||
             (c3 >= c1 &&
-             mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
+             mode_mv[NEARMV][refs[0]].as_int == 0))
           return INT64_MAX;
       } else {
         if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
-             mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
+             mode_mv[NEARESTMV][refs[0]].as_int == 0 &&
+             mode_mv[NEARESTMV][refs[1]].as_int == 0) ||
             (c3 >= c1 &&
-             mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
-             mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
+             mode_mv[NEARMV][refs[0]].as_int == 0 &&
+             mode_mv[NEARMV][refs[1]].as_int == 0))
           return INT64_MAX;
       }
     }
@@ -2784,8 +2792,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
    * words if you present them in that order, the second one is always known
    * if the first is known */
-  *rate2 += cost_mv_ref(cpi, this_mode,
-                        mbmi->mode_context[mbmi->ref_frame[0]]);
+  *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
 
   if (!(*mode_excluded))
     *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -3003,8 +3010,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int64_t rdcosty = INT64_MAX;
 
     // Y cost and distortion
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
-                    bsize, txfm_cache, ref_best_rd);
+    inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+                          bsize, txfm_cache, ref_best_rd);
 
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
@@ -3019,7 +3026,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
-    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+    super_block_uvrd(x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
                      bsize, ref_best_rd - rdcosty);
     if (*rate_uv == INT_MAX) {
       *rate2 = INT_MAX;
@@ -3204,25 +3211,69 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     // All modes from vp9_mode_order that use this frame as any ref
     static const int ref_frame_mask_all[] = {
-        0x123291, 0x25c444, 0x39b722
+        0x0, 0x123291, 0x25c444, 0x39b722
     };
     // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
     // this frame as their primary ref
     static const int ref_frame_mask_fixedmv[] = {
-        0x121281, 0x24c404, 0x080102
+        0x0, 0x121281, 0x24c404, 0x080102
     };
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
       // Skip modes for missing references
-      mode_skip_mask |= ref_frame_mask_all[ref_frame - LAST_FRAME];
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
     } else if (cpi->sf.reference_masking) {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame - LAST_FRAME];
+          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
           break;
         }
       }
     }
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+    }
+  }
+
+  // If the segment skip feature is enabled....
+  // then do nothing if the current mode is not allowed..
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    const int inter_non_zero_mode_mask = 0x1F7F7;
+    mode_skip_mask |= inter_non_zero_mode_mask;
+  }
+
+  // Disable this drop out case if the ref frame
+  // segment level feature is enabled for this segment. This is to
+  // prevent the possibility that we end up unable to pick any mode.
+  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      const int altref_zero_mask =
+          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      mode_skip_mask |= altref_zero_mask;
+      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARA);
+      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARESTA);
+    }
+  }
+
+  // TODO(JBB): This is to make up for the fact that we don't have sad
+  // functions that work when the block size reads outside the umv.  We
+  // should fix this either by making the motion search just work on
+  // a representative block in the boundary ( first ) and then implement a
+  // function that does sads when inside the border..
+  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
+    const int new_modes_mask =
+        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
+        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
+    mode_skip_mask |= new_modes_mask;
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3236,7 +3287,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int64_t tx_cache[TX_MODES];
     int i;
     int this_skip2 = 0;
-    int64_t total_sse = INT_MAX;
+    int64_t total_sse = INT64_MAX;
     int early_term = 0;
 
     // Look at the reference frame of the best mode so far and set the
@@ -3270,15 +3321,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    if (ref_frame != INTRA_FRAME &&
+        cpi->sf.disable_inter_mode_mask[bsize] & (1 << INTER_OFFSET(this_mode)))
+      continue;
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      // Do not allow compound prediction if the segment level reference
-      // frame feature is in use as in this case there can only be one
-      // reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-        continue;
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
           vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
         continue;
@@ -3294,47 +3343,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
             mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
     }
 
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
-      continue;
-    // Disable this drop out case if the ref frame
-    // segment level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative. We allow near/nearest as well
-      // because they may result in zero-zero MVs but be cheaper.
-      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if ((this_mode != ZEROMV &&
-             !(this_mode == NEARMV &&
-               frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
-             !(this_mode == NEARESTMV &&
-               frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
-            ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-    // TODO(JBB): This is to make up for the fact that we don't have sad
-    // functions that work when the block size reads outside the umv.  We
-    // should fix this either by making the motion search just work on
-    // a representative block in the boundary ( first ) and then implement a
-    // function that does sads when inside the border..
-    if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
-        this_mode == NEWMV) {
-      continue;
-    }
-
     if (ref_frame == INTRA_FRAME) {
       // Disable intra modes other than DC_PRED for blocks with low variance
       // Threshold for intra skipping based on source variance
@@ -3389,8 +3397,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
-                      bsize, tx_cache, best_rd);
+      intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
+                            bsize, tx_cache, best_rd);
 
       if (rate_y == INT_MAX)
         continue;
@@ -4146,7 +4154,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
         // then dont bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+        super_block_uvrd(x, &rate_uv, &distortion_uv, &uv_skippable,
                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
         if (rate_uv == INT_MAX)
           continue;
@@ -4365,7 +4373,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
     *returnrate = INT_MAX;
-    *returndistortion = INT_MAX;
+    *returndistortion = INT64_MAX;
     return best_rd;
   }
 
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 96cea4216..6b85d67f8 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -80,10 +80,10 @@ void vp9_init_me_luts();
 void vp9_set_mbmode_and_mvs(MACROBLOCKD *xd, MB_PREDICTION_MODE mode,
                             const MV *mv);
 
-void vp9_get_entropy_contexts(TX_SIZE tx_size,
-    ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
-    const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
-    int num_4x4_w, int num_4x4_h);
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index 0766b5107..4e6efaeb9 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_resize.h"
 
@@ -24,9 +25,6 @@
 #define SUBPEL_MASK               ((1 << SUBPEL_BITS) - 1)
 #define INTERP_PRECISION_BITS     32
 
-#define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
-
 typedef int16_t interp_kernel[INTERP_TAPS];
 
 // Filters for interpolation (0.5-band) - note this also filters integer pels.
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 509717e0a..49fd7bb48 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -10,29 +10,25 @@
 
 
 #include <limits.h>
+
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_segmentation.h"
+
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
-void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg =  &cpi->common.seg;
+#include "vp9/encoder/vp9_segmentation.h"
 
+void vp9_enable_segmentation(struct segmentation *seg) {
   seg->enabled = 1;
   seg->update_map = 1;
   seg->update_data = 1;
 }
 
-void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg =  &cpi->common.seg;
+void vp9_disable_segmentation(struct segmentation *seg) {
   seg->enabled = 0;
 }
 
-void vp9_set_segmentation_map(VP9_PTR ptr,
-                              unsigned char *segmentation_map) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
+void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map) {
   struct segmentation *const seg = &cpi->common.seg;
 
   // Copy in the new segmentation map
@@ -44,12 +40,9 @@ void vp9_set_segmentation_map(VP9_PTR ptr,
   seg->update_data = 1;
 }
 
-void vp9_set_segment_data(VP9_PTR ptr,
+void vp9_set_segment_data(struct segmentation *seg,
                           signed char *feature_data,
                           unsigned char abs_delta) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg = &cpi->common.seg;
-
   seg->abs_delta = abs_delta;
 
   vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h
index 8238892e2..66c51a21b 100644
--- a/vp9/encoder/vp9_segmentation.h
+++ b/vp9/encoder/vp9_segmentation.h
@@ -19,8 +19,8 @@
 extern "C" {
 #endif
 
-void vp9_enable_segmentation(VP9_PTR ptr);
-void vp9_disable_segmentation(VP9_PTR ptr);
+void vp9_enable_segmentation(struct segmentation *seg);
+void vp9_disable_segmentation(struct segmentation *seg);
 
 void vp9_disable_segfeature(struct segmentation *seg,
                             int segment_id,
@@ -30,7 +30,7 @@ void vp9_clear_segdata(struct segmentation *seg,
                        SEG_LVL_FEATURES feature_id);
 // Valid values for a segment are 0 to 3
 // Segmentation map is arrange as [Rows][Columns]
-void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);
+void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map);
 
 // The values given for each segment can be either deltas (from the default
 // value chosen for the frame) or absolute values.
@@ -42,7 +42,7 @@ void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);
 //
 // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
 // the absolute values given).
-void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
                           unsigned char abs_delta);
 
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 84fb35e21..fdc210646 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -150,8 +150,7 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const vp9_prob *oldp,
                                               vp9_prob *bestp,
-                                              vp9_prob upd,
-                                              int b, int r) {
+                                              vp9_prob upd) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
   vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index ab5659bf7..8e9c0c62a 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -33,8 +33,7 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const vp9_prob *oldp,
                                               vp9_prob *bestp,
-                                              vp9_prob upd,
-                                              int b, int r);
+                                              vp9_prob upd);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 004047773..623311659 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -20,7 +20,6 @@
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_psnr.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
@@ -125,8 +124,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
-                                              int stride,
-                                              int error_thresh) {
+                                              int stride) {
   MACROBLOCK *x = &cpi->mb;
   MACROBLOCKD* const xd = &x->e_mbd;
   int step_param;
@@ -268,8 +266,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
                 (cpi,
                  cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
                  cpi->frames[frame]->y_buffer + mb_y_offset,
-                 cpi->frames[frame]->y_stride,
-                 THRESH_LOW);
+                 cpi->frames[frame]->y_stride);
 #endif
           // Assign higher weight to matching MB if it's error
           // score is lower. If not applying MC default behavior
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 2be00ff62..e8179f302 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -23,8 +23,8 @@
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int dct_value_cost[DCT_MAX_VALUE * 2];
-const int *vp9_dct_value_cost_ptr;
+static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
+const int16_t *vp9_dct_value_cost_ptr;
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
@@ -199,6 +199,12 @@ static INLINE void add_token_no_extra(TOKENEXTRA **t,
   ++counts[token];
 }
 
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  const int eob_max = 16 << (tx_size << 1);
+  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index ea86240be..063c0bafe 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -47,7 +47,7 @@ struct VP9_COMP;
 void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
                      BLOCK_SIZE bsize);
 
-extern const int *vp9_dct_value_cost_ptr;
+extern const int16_t *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
  *  fields are not.
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 600029b19..c71c171bb 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -83,7 +83,7 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) {
   if (cm->frame_type == KEY_FRAME ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
     seg->abs_delta = SEGMENT_DELTADATA;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index a448b3c52..9fb611504 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -25,7 +25,6 @@ VP9_COMMON_SRCS-yes += common/vp9_filter.c
 VP9_COMMON_SRCS-yes += common/vp9_filter.h
 VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
 VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
-VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c
 VP9_COMMON_SRCS-yes += common/vp9_idct.c
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h
 VP9_COMMON_SRCS-yes += common/vp9_blockd.h
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index a4162e942..28c60d1b8 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -355,6 +355,13 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
 
   oxcf->ss_number_layers = cfg.ss_number_layers;
 
+  if (oxcf->ss_number_layers > 1) {
+    memcpy(oxcf->ss_target_bitrate, cfg.ss_target_bitrate,
+           sizeof(cfg.ss_target_bitrate));
+  } else if (oxcf->ss_number_layers == 1) {
+    oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+  }
+
   oxcf->ts_number_layers = cfg.ts_number_layers;
 
   if (oxcf->ts_number_layers > 1) {
@@ -1160,6 +1167,7 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
       9999,               /* kf_max_dist */
 
       VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
+      {0},                /* ss_target_bitrate */
       1,                  /* ts_number_layers */
       {0},                /* ts_target_bitrate */
       {0},                /* ts_rate_decimator */
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 27dd6f625..6679f89be 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -38,7 +38,6 @@ VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
 VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h
-VP9_CX_SRCS-yes += encoder/vp9_psnr.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
@@ -50,7 +49,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
 VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
-VP9_CX_SRCS-yes += encoder/vp9_psnr.c
 VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
diff --git a/vpx/internal/vpx_psnr.h b/vpx/internal/vpx_psnr.h
new file mode 100644
index 000000000..07d81bb8d
--- /dev/null
+++ b/vpx/internal/vpx_psnr.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_INTERNAL_VPX_PSNR_H_
+#define VPX_INTERNAL_VPX_PSNR_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index adce47637..c7837244f 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -13,6 +13,7 @@
  * VP9 SVC encoding support via libvpx
  */
 
+#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -40,6 +41,7 @@ _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
 #define SUPERFRAME_SLOTS (8)
 #define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
 #define OPTION_BUFFER_SIZE 256
+#define COMPONENTS 4  // psnr & sse statistics maintained for total, y, u, v
 
 static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
 static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
@@ -47,16 +49,20 @@ static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
 typedef struct SvcInternal {
   char options[OPTION_BUFFER_SIZE];        // set by vpx_svc_set_options
   char quantizers[OPTION_BUFFER_SIZE];     // set by vpx_svc_set_quantizers
+  char quantizers_keyframe[OPTION_BUFFER_SIZE];  // set by
+                                                 // vpx_svc_set_quantizers
   char scale_factors[OPTION_BUFFER_SIZE];  // set by vpx_svc_set_scale_factors
 
   // values extracted from option, quantizers
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
+  int quantizer_keyframe[VPX_SS_MAX_LAYERS];
   int quantizer[VPX_SS_MAX_LAYERS];
 
   // accumulated statistics
-  double psnr_in_layer[VPX_SS_MAX_LAYERS];
-  uint32_t bytes_in_layer[VPX_SS_MAX_LAYERS];
+  double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS];   // total/Y/U/V
+  uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
+  uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
 
   // codec encoding values
   int width;    // width of highest layer
@@ -268,7 +274,8 @@ static vpx_codec_err_t set_option_encoding_mode(SvcContext *svc_ctx,
 }
 
 static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
-                                              const char *quantizer_values) {
+                                              const char *quantizer_values,
+                                              const int is_keyframe) {
   char *input_string;
   char *token;
   const char *delim = ",";
@@ -279,6 +286,11 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
   SvcInternal *const si = get_svc_internal(svc_ctx);
 
   if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
+    if (is_keyframe) {
+      // If there non settings for key frame, we will apply settings from
+      // non key frame. So just simply return here.
+      return VPX_CODEC_INVALID_PARAM;
+    }
     input_string = strdup(DEFAULT_QUANTIZER_VALUES);
   } else {
     input_string = strdup(quantizer_values);
@@ -299,7 +311,12 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
     } else {
       q = 0;
     }
-    si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
+    if (is_keyframe) {
+      si->quantizer_keyframe[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers]
+      = q;
+    } else {
+      si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
+    }
   }
   if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
@@ -384,6 +401,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
   char *option_name;
   char *option_value;
   char *input_ptr;
+  int is_keyframe_qaunt_set = 0;
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   if (options == NULL) return VPX_CODEC_OK;
@@ -409,8 +427,17 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
       res = parse_scale_factors(svc_ctx, option_value);
       if (res != VPX_CODEC_OK) break;
     } else if (strcmp("quantizers", option_name) == 0) {
-      res = parse_quantizer_values(svc_ctx, option_value);
+      res = parse_quantizer_values(svc_ctx, option_value, 0);
+      if (res != VPX_CODEC_OK) break;
+      if (!is_keyframe_qaunt_set) {
+        SvcInternal *const si = get_svc_internal(svc_ctx);
+        memcpy(get_svc_internal(svc_ctx)->quantizer_keyframe, si->quantizer,
+               sizeof(si->quantizer));
+      }
+    } else if (strcmp("quantizers-keyframe", option_name) == 0) {
+      res = parse_quantizer_values(svc_ctx, option_value, 1);
       if (res != VPX_CODEC_OK) break;
+      is_keyframe_qaunt_set = 1;
     } else {
       svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
       res = VPX_CODEC_INVALID_PARAM;
@@ -433,13 +460,19 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
 }
 
 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizers) {
+                                       const char *quantizers,
+                                       const int is_for_keyframe) {
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
-  si->quantizers[sizeof(si->quantizers) - 1] = '\0';
+  if (is_for_keyframe) {
+    strncpy(si->quantizers_keyframe, quantizers, sizeof(si->quantizers));
+    si->quantizers_keyframe[sizeof(si->quantizers_keyframe) - 1] = '\0';
+  } else {
+    strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
+    si->quantizers[sizeof(si->quantizers) - 1] = '\0';
+  }
   return VPX_CODEC_OK;
 }
 
@@ -490,9 +523,13 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   // for first frame
   si->layers = svc_ctx->spatial_layers;
 
-  res = parse_quantizer_values(svc_ctx, si->quantizers);
+  res = parse_quantizer_values(svc_ctx, si->quantizers, 0);
   if (res != VPX_CODEC_OK) return res;
 
+  res = parse_quantizer_values(svc_ctx, si->quantizers_keyframe, 1);
+  if (res != VPX_CODEC_OK)
+    memcpy(si->quantizer_keyframe, si->quantizer, sizeof(si->quantizer));
+
   res = parse_scale_factors(svc_ctx, si->scale_factors);
   if (res != VPX_CODEC_OK) return res;
 
@@ -500,6 +537,34 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   res = parse_options(svc_ctx, si->options);
   if (res != VPX_CODEC_OK) return res;
 
+  // Assign target bitrate for each layer. We calculate the ratio
+  // from the resolution for now.
+  // TODO(Minghai): Optimize the mechanism of allocating bits after
+  // implementing svc two pass rate control.
+  if (si->layers > 1) {
+    int i;
+    float total = 0;
+    float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
+
+    for (i = 0; i < si->layers; ++i) {
+      int pos = i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers;
+      if (pos < VPX_SS_MAX_LAYERS && si->scaling_factor_den[pos] > 0) {
+        alloc_ratio[i] = (float)(si->scaling_factor_num[pos] * 1.0 /
+            si->scaling_factor_den[pos]);
+
+        alloc_ratio[i] *= alloc_ratio[i];
+        total += alloc_ratio[i];
+      }
+    }
+
+    for (i = 0; i < si->layers; ++i) {
+      if (total > 0) {
+        enc_cfg->ss_target_bitrate[i] = (unsigned int)
+            (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
+      }
+    }
+  }
+
   // modify encoder configuration
   enc_cfg->ss_number_layers = si->layers;
   enc_cfg->ts_number_layers = 1;  // Temporal layers not used in this encoder.
@@ -713,8 +778,15 @@ static void set_svc_parameters(SvcContext *svc_ctx,
     svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n");
   }
   layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
-  svc_params.min_quantizer = si->quantizer[layer_index];
-  svc_params.max_quantizer = si->quantizer[layer_index];
+
+  if (vpx_svc_is_keyframe(svc_ctx)) {
+    svc_params.min_quantizer = si->quantizer_keyframe[layer_index];
+    svc_params.max_quantizer = si->quantizer_keyframe[layer_index];
+  } else {
+    svc_params.min_quantizer = si->quantizer[layer_index];
+    svc_params.max_quantizer = si->quantizer[layer_index];
+  }
+
   svc_params.distance_from_i_frame = si->frame_within_gop;
 
   // Use buffer i for layer i LST
@@ -814,7 +886,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
       switch (cx_pkt->kind) {
         case VPX_CODEC_CX_FRAME_PKT: {
           const uint32_t frame_pkt_size = (uint32_t)(cx_pkt->data.frame.sz);
-          si->bytes_in_layer[si->layer] += frame_pkt_size;
+          si->bytes_sum[si->layer] += frame_pkt_size;
           svc_log(svc_ctx, SVC_LOG_DEBUG,
                   "SVC frame: %d, layer: %d, size: %u\n",
                   si->encode_frame_count, si->layer, frame_pkt_size);
@@ -832,13 +904,23 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
           break;
         }
         case VPX_CODEC_PSNR_PKT: {
+          int i;
           svc_log(svc_ctx, SVC_LOG_DEBUG,
                   "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
                   "%2.3f  %2.3f  %2.3f  %2.3f \n",
                   si->encode_frame_count, si->layer,
                   cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
                   cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-          si->psnr_in_layer[si->layer] += cx_pkt->data.psnr.psnr[0];
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->encode_frame_count, si->layer,
+                  cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
+                  cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
+          for (i = 0; i < COMPONENTS; i++) {
+            si->psnr_sum[si->layer][i] += cx_pkt->data.psnr.psnr[i];
+            si->sse_sum[si->layer][i] += cx_pkt->data.psnr.sse[i];
+          }
           break;
         }
         default: {
@@ -916,11 +998,21 @@ void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
   si->frame_within_gop = 0;
 }
 
+static double calc_psnr(double d) {
+  if (d == 0) return 100;
+  return -10.0 * log(d) / log(10.0);
+}
+
 // dump accumulated statistics and reset accumulated values
 const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
   int number_of_frames, number_of_keyframes, encode_frame_count;
-  int i;
+  int i, j;
   uint32_t bytes_total = 0;
+  double scale[COMPONENTS];
+  double psnr[COMPONENTS];
+  double mse[COMPONENTS];
+  double y_scale;
+
   SvcInternal *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || si == NULL) return NULL;
 
@@ -938,12 +1030,36 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
         (i == 1 || i == 3)) {
       number_of_frames -= number_of_keyframes;
     }
-    svc_log(svc_ctx, SVC_LOG_INFO, "Layer %d PSNR=[%2.3f], Bytes=[%u]\n", i,
-            (double)si->psnr_in_layer[i] / number_of_frames,
-            si->bytes_in_layer[i]);
-    bytes_total += si->bytes_in_layer[i];
-    si->psnr_in_layer[i] = 0;
-    si->bytes_in_layer[i] = 0;
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
+            i, (double)si->psnr_sum[i][0] / number_of_frames,
+            (double)si->psnr_sum[i][1] / number_of_frames,
+            (double)si->psnr_sum[i][2] / number_of_frames,
+            (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]);
+    // the following psnr calculation is deduced from ffmpeg.c#print_report
+    y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames;
+    scale[1] = y_scale;
+    scale[2] = scale[3] = y_scale / 4;  // U or V
+    scale[0] = y_scale * 1.5;           // total
+
+    for (j = 0; j < COMPONENTS; j++) {
+      psnr[j] = calc_psnr(si->sse_sum[i][j] / scale[j]);
+      mse[j] = si->sse_sum[i][j] * 255.0 * 255.0 / scale[j];
+    }
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Overall PSNR=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, psnr[0],
+            psnr[1], psnr[2], psnr[3]);
+    svc_log(svc_ctx, SVC_LOG_INFO,
+            "Layer %d Overall MSE=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, mse[0],
+            mse[1], mse[2], mse[3]);
+
+    bytes_total += si->bytes_sum[i];
+    // clear sums for next time
+    si->bytes_sum[i] = 0;
+    for (j = 0; j < COMPONENTS; ++j) {
+      si->psnr_sum[i][j] = 0;
+      si->sse_sum[i][j] = 0;
+    }
   }
 
   // only display statistics once
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 23742c8e8..e69d96efb 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -255,8 +255,8 @@ vpx_codec_err_t  vpx_codec_encode(vpx_codec_ctx_t            *ctx,
 }
 
 
-const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t   *ctx,
-                                                vpx_codec_iter_t  *iter) {
+const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx,
+                                                vpx_codec_iter_t *iter) {
   const vpx_codec_cx_pkt_t *pkt = NULL;
 
   if (ctx) {
@@ -271,32 +271,30 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t   *ctx,
   }
 
   if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
-    /* If the application has specified a destination area for the
-     * compressed data, and the codec has not placed the data there,
-     * and it fits, copy it.
-     */
-    char *dst_buf = ctx->priv->enc.cx_data_dst_buf.buf;
-
-    if (dst_buf
-        && pkt->data.raw.buf != dst_buf
-        && pkt->data.raw.sz
-        + ctx->priv->enc.cx_data_pad_before
-        + ctx->priv->enc.cx_data_pad_after
-        <= ctx->priv->enc.cx_data_dst_buf.sz) {
-      vpx_codec_cx_pkt_t *modified_pkt = &ctx->priv->enc.cx_data_pkt;
-
-      memcpy(dst_buf + ctx->priv->enc.cx_data_pad_before,
-             pkt->data.raw.buf, pkt->data.raw.sz);
+    // If the application has specified a destination area for the
+    // compressed data, and the codec has not placed the data there,
+    // and it fits, copy it.
+    vpx_codec_priv_t *const priv = ctx->priv;
+    char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;
+
+    if (dst_buf &&
+        pkt->data.raw.buf != dst_buf &&
+        pkt->data.raw.sz + priv->enc.cx_data_pad_before +
+            priv->enc.cx_data_pad_after <= priv->enc.cx_data_dst_buf.sz) {
+      vpx_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt;
+
+      memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
+             pkt->data.raw.sz);
       *modified_pkt = *pkt;
       modified_pkt->data.raw.buf = dst_buf;
-      modified_pkt->data.raw.sz += ctx->priv->enc.cx_data_pad_before
-                                   + ctx->priv->enc.cx_data_pad_after;
+      modified_pkt->data.raw.sz += priv->enc.cx_data_pad_before +
+                                       priv->enc.cx_data_pad_after;
       pkt = modified_pkt;
     }
 
     if (dst_buf == pkt->data.raw.buf) {
-      ctx->priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
-      ctx->priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
+      priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
+      priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
     }
   }
 
diff --git a/vpx/src/vpx_psnr.c b/vpx/src/vpx_psnr.c
new file mode 100644
index 000000000..05843acb6
--- /dev/null
+++ b/vpx/src/vpx_psnr.c
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx/internal/vpx_psnr.h"
+
+#define MAX_PSNR 100.0
+
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+  if (sse > 0.0) {
+    const double psnr = 10.0 * log10(samples * peak * peak / sse);
+    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+  } else {
+    return MAX_PSNR;
+  }
+}
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index f675fb684..98474ca91 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -64,7 +64,8 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
  * e.g., "60,53,39,33,27"
  */
 vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
-                                       const char *quantizer_values);
+                                       const char *quantizer_values,
+                                       const int is_for_keyframe);
 
 /**
  * Set SVC scale factors
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index 111c87e53..98d1d567c 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -34,8 +34,10 @@ API_SRCS-yes                += vpx_decoder.h
 API_SRCS-yes                += src/vpx_encoder.c
 API_SRCS-yes                += vpx_encoder.h
 API_SRCS-yes                += internal/vpx_codec_internal.h
+API_SRCS-yes                += internal/vpx_psnr.h
 API_SRCS-yes                += src/vpx_codec.c
 API_SRCS-yes                += src/vpx_image.c
+API_SRCS-yes                += src/vpx_psnr.c
 API_SRCS-yes                += vpx_codec.h
 API_SRCS-yes                += vpx_codec.mk
 API_SRCS-yes                += vpx_frame_buffer.h
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 1d9f0c9b7..851ff1ae8 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -610,6 +610,13 @@ extern "C" {
      */
     unsigned int           ss_number_layers;
 
+    /*!\brief Target bitrate for each spatial layer.
+     *
+     * These values specify the target coding bitrate to be used for each
+     * spatial layer.
+     */
+    unsigned int           ss_target_bitrate[VPX_SS_MAX_LAYERS];
+
     /*!\brief Number of temporal coding layers.
      *
      * This value specifies the number of temporal layers to be used.
diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h
index da7c65d96..24743c8d6 100644
--- a/vpx_ports/mem_ops_aligned.h
+++ b/vpx_ports/mem_ops_aligned.h
@@ -11,6 +11,8 @@
 #ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_
 #define VPX_PORTS_MEM_OPS_ALIGNED_H_
 
+#include "vpx/vpx_integer.h"
+
 /* \file
  * \brief Provides portable memory access primitives for operating on aligned
  *        data
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 7896dfee6..7b43eece2 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -81,9 +81,7 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
 }
 
 #if CONFIG_VP9
-static void extend_frame(YV12_BUFFER_CONFIG *const ybf,
-                         int subsampling_x, int subsampling_y,
-                         int ext_size) {
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   const int c_w = ybf->uv_crop_width;
   const int c_h = ybf->uv_crop_height;
   const int c_ext_size = ext_size >> 1;
@@ -110,16 +108,14 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf,
                c_w, c_h, c_et, c_el, c_eb, c_er);
 }
 
-void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
-                                int subsampling_x, int subsampling_y) {
-  extend_frame(ybf, subsampling_x, subsampling_y, ybf->border);
+void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  extend_frame(ybf, ybf->border);
 }
 
-void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf,
-                                      int subsampling_x, int subsampling_y) {
+void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
   const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ?
                        VP9INNERBORDERINPIXELS : ybf->border;
-  extend_frame(ybf, subsampling_x, subsampling_y, inner_bw);
+  extend_frame(ybf, inner_bw);
 }
 #endif  // CONFIG_VP9
 
diff --git a/vpx_scale/vpx_scale_rtcd.sh b/vpx_scale/vpx_scale_rtcd.sh
index a5faf1148..c26208c59 100644
--- a/vpx_scale/vpx_scale_rtcd.sh
+++ b/vpx_scale/vpx_scale_rtcd.sh
@@ -6,7 +6,7 @@ EOF
 forward_decls vpx_scale_forward_decls
 
 # Scaler functions
-if [ "CONFIG_SPATIAL_RESAMPLING" != "yes" ]; then
+if [ "$CONFIG_SPATIAL_RESAMPLING" = "yes" ]; then
     prototype void vp8_horizontal_line_5_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
     prototype void vp8_vertical_band_5_4_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
     prototype void vp8_horizontal_line_5_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
@@ -26,9 +26,9 @@ prototype void vpx_yv12_copy_y "const struct yv12_buffer_config *src_ybc, struct
 specialize vpx_yv12_copy_y neon
 
 if [ "$CONFIG_VP9" = "yes" ]; then
-    prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
+    prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf"
     specialize vp9_extend_frame_borders dspr2
 
-    prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
+    prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf"
     specialize vp9_extend_frame_inner_borders dspr2
 fi
diff --git a/vpxdec.c b/vpxdec.c
index 660f6136f..b69e55eeb 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -23,6 +23,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
+#include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
 
 #if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
@@ -298,10 +299,11 @@ int file_is_raw(struct VpxInputContext *input) {
   return is_raw;
 }
 
-void show_progress(int frame_in, int frame_out, unsigned long dx_time) {
-  fprintf(stderr, "%d decoded frames/%d showed frames in %lu us (%.2f fps)\r",
+void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
+  fprintf(stderr,
+          "%d decoded frames/%d showed frames in %"PRId64" us (%.2f fps)\r",
           frame_in, frame_out, dx_time,
-          (float)frame_out * 1000000.0 / (float)dx_time);
+          (double)frame_out * 1000000.0 / (double)dx_time);
 }
 
 struct ExternalFrameBuffer {
@@ -493,7 +495,7 @@ int main_loop(int argc, const char **argv_) {
   int                    ec_enabled = 0;
   const VpxInterface *interface = NULL;
   const VpxInterface *fourcc_interface = NULL;
-  unsigned long          dx_time = 0;
+  uint64_t dx_time = 0;
   struct arg               arg;
   char                   **argv, **argi, **argj;
 
@@ -803,7 +805,7 @@ int main_loop(int argc, const char **argv_) {
         }
 
         vpx_usec_timer_mark(&timer);
-        dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
+        dx_time += vpx_usec_timer_elapsed(&timer);
       }
     }
 
diff --git a/vpxenc.c b/vpxenc.c
index 8cd5a103a..c61d83e41 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1399,8 +1399,8 @@ static void show_psnr(struct stream_state  *stream) {
     return;
 
   fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-  ovpsnr = vp8_mse2psnr((double)stream->psnr_samples_total, 255.0,
-                        (double)stream->psnr_sse_total);
+  ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, 255.0,
+                       (double)stream->psnr_sse_total);
   fprintf(stderr, " %.3f", ovpsnr);
 
   for (i = 0; i < 4; i++) {
diff --git a/vpxstats.c b/vpxstats.c
index 70cea3ee7..5f88f8d35 100644
--- a/vpxstats.c
+++ b/vpxstats.c
@@ -120,16 +120,3 @@ void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
 vpx_fixed_buf_t stats_get(stats_io_t *stats) {
   return stats->buf;
 }
-
-double vp8_mse2psnr(double samples, double peak, double mse) {
-  const int kMaxPSNR = 100;
-  double psnr = kMaxPSNR;
-
-  if (mse > 0.0)
-    psnr = 10.0 * log10(peak * peak * samples / mse);
-
-  if (psnr > kMaxPSNR)
-    psnr = kMaxPSNR;
-
-  return psnr;
-}
diff --git a/vpxstats.h b/vpxstats.h
index 9ce9c5320..5c9ea34f7 100644
--- a/vpxstats.h
+++ b/vpxstats.h
@@ -36,8 +36,6 @@ void stats_close(stats_io_t *stats, int last_pass);
 void stats_write(stats_io_t *stats, const void *pkt, size_t len);
 vpx_fixed_buf_t stats_get(stats_io_t *stats);
 
-double vp8_mse2psnr(double samples, double peak, double mse);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/webmdec.c b/webmdec.c
index 081d57ab8..7cacdf922 100644
--- a/webmdec.c
+++ b/webmdec.c
@@ -12,7 +12,7 @@
 
 #include <stdarg.h>
 
-#include "nestegg/include/nestegg/nestegg.h"
+#include "third_party/nestegg/include/nestegg/nestegg.h"
 
 static int nestegg_read_cb(void *buffer, size_t length, void *userdata) {
   FILE *f = userdata;