64 files changed, 2070 insertions, 604 deletions
@@ -76,7 +76,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc - ppc64-linux-gcc ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc diff --git a/build/make/configure.sh b/build/make/configure.sh index c4e3b5141..480b2d0ea 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -719,11 +719,8 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; - power*64*-*) - tgt_isa=ppc64 - ;; - power*) - tgt_isa=ppc + power*64le*-*) + tgt_isa=ppc64le ;; *mips64el*) tgt_isa=mips64 @@ -835,7 +832,7 @@ process_common_toolchain() { IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi # Handle darwin variants. Newer SDKs allow targeting older @@ -1221,7 +1218,7 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) + ppc64le*) link_with_cc=gcc setup_gnu_toolchain check_gcc_machine_option "vsx" diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh index 365a8c013..2442a282d 100755 --- a/build/make/iosbuild.sh +++ b/build/make/iosbuild.sh @@ -132,7 +132,8 @@ create_vpx_framework_config_shim() { done # Consume the last line of output from the loop: We don't want it. - sed -i '' -e '$d' "${config_file}" + sed -i.bak -e '$d' "${config_file}" + rm "${config_file}.bak" printf "#endif\n\n" >> "${config_file}" printf "#endif // ${include_guard}" >> "${config_file}" @@ -350,7 +351,7 @@ if [ "$ENABLE_SHARED" = "yes" ]; then IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi if [ "${VERBOSE}" = "yes" ]; then @@ -116,7 +116,6 @@ all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" @@ -328,6 +327,7 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -389,6 +389,7 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth @@ -521,7 +522,7 @@ process_detect() { # here rather than at option parse time because the target auto-detect # magic happens after the command line has been parsed. case "${tgt_os}" in - linux|os2|darwin*|iphonesimulator*) + linux|os2|solaris|darwin*|iphonesimulator*) # Supported platforms ;; *) @@ -593,6 +594,10 @@ EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -699,7 +704,7 @@ process_toolchain() { soft_enable libyuv ;; *-android-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; @@ -708,7 +713,7 @@ process_toolchain() { # x86 targets. 
;; *-iphonesimulator-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv ;; *-win*) @@ -718,9 +723,7 @@ process_toolchain() { check_cxx "$@" <<EOF && soft_enable unit_tests int z; EOF - check_cxx "$@" <<EOF && soft_enable webm_io -int z; -EOF + check_add_cxxflags -std=c++11 && soft_enable webm_io check_cxx "$@" <<EOF && soft_enable libyuv int z; EOF @@ -729,9 +732,7 @@ EOF enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests int z; EOF - check_cxx "$@" <<EOF && soft_enable webm_io -int z; -EOF + check_add_cxxflags -std=c++11 && soft_enable webm_io check_cxx "$@" <<EOF && soft_enable libyuv int z; EOF diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 091c6954d..3fd961bdc 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -622,6 +622,7 @@ int main(int argc, const char **argv) { vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; + vpx_svc_frame_drop_t svc_drop_frame; uint32_t i; uint32_t frame_cnt = 0; vpx_image_t raw; @@ -732,6 +733,12 @@ int main(int argc, const char **argv) { vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; + svc_drop_frame.max_consec_drop = INT_MAX; + vpx_codec_control(&codec, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + // Encode frames while (!end_of_stream) { vpx_codec_iter_t iter = NULL; @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } @@ -282,18 +282,6 @@ $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) -libvpx.ver: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)echo "{ global:" > $@ - $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done - $(qexec)echo "local: *; };" >> $@ -CLEAN-OBJS += libvpx.ver - -libvpx.syms: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)awk '{print "_"$$2}' $^ >$@ -CLEAN-OBJS += libvpx.syms - libvpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ @@ -353,6 +341,18 @@ INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc endif +libvpx.ver: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo "{ global:" > $@ + $(qexec)for f in $?; 
do awk '{print $$2";"}' < $$f >>$@; done + $(qexec)echo "local: *; };" >> $@ +CLEAN-OBJS += libvpx.ver + +libvpx.syms: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)awk '{print "_"$$2}' $^ >$@ +CLEAN-OBJS += libvpx.syms + # # Rule to make assembler configuration file from C configuration file # diff --git a/test/bench.cc b/test/bench.cc new file mode 100644 index 000000000..281b7411d --- /dev/null +++ b/test/bench.cc @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdio.h> +#include <algorithm> + +#include "test/bench.h" +#include "vpx_ports/vpx_timer.h" + +void AbstractBench::runNTimes(int n) { + for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < n; ++j) { + run(); + } + vpx_usec_timer_mark(&timer); + times[r] = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + } +} + +void AbstractBench::printMedian(const char *title) { + std::sort(times, times + VPX_BENCH_ROBUST_ITER); + const int med = times[VPX_BENCH_ROBUST_ITER >> 1]; + int sad = 0; + for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) { + sad += abs(times[t] - med); + } + printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0, + sad / (VPX_BENCH_ROBUST_ITER * 1000.0)); +} diff --git a/test/bench.h b/test/bench.h new file mode 100644 index 000000000..0b0cf10a4 --- /dev/null +++ b/test/bench.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_BENCH_H_ +#define TEST_BENCH_H_ + +// Number of iterations used to compute median run time. +#define VPX_BENCH_ROBUST_ITER 15 + +class AbstractBench { + public: + void runNTimes(int n); + void printMedian(const char *title); + + protected: + // Implement this method and put the code to benchmark in it. 
+ virtual void run() = 0; + + private: + int times[VPX_BENCH_ROBUST_ITER]; +}; + +#endif // TEST_BENCH_H_ diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index 404b5b44f..34e35b065 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -152,5 +152,5 @@ VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest, ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(0, 9)); + ::testing::Range(0, 10)); } // namespace diff --git a/test/dct_test.cc b/test/dct_test.cc index 10062150f..e8ad0cd5d 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -725,4 +725,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_vsx>, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 0bb435502..142d9e2da 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -48,7 +48,7 @@ const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), }; -const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 }; +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 }; const int kEncodePerfTestThreads[] = { 1, 2, 4 }; #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc index 63e972a00..b2cbc3f05 100644 --- a/test/encode_test_driver.cc +++ b/test/encode_test_driver.cc @@ -201,7 +201,7 @@ void EncoderTest::RunLoop(VideoSource *video) { PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); - PostEncodeFrameHook(); + PostEncodeFrameHook(encoder.get()); CxDataIterator iter = encoder->GetCxData(); diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index a301e21cc..03624d110 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -226,7 +226,7 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} - virtual void PostEncodeFrameHook() {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} // Hook to be called on every compressed data packet. 
virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 43a4c6929..9cfaa1f1f 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -124,6 +124,7 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } const DecodeParam kVP8InvalidFileTests[] = { { 1, "invalid-bug-1443.ivf" }, { 1, "invalid-token-partition.ivf" }, + { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 5862d2649..4b5d55469 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -16,7 +16,7 @@ #include "test/video_source.h" namespace libvpx_test { -const unsigned int kCodeBufferSize = 256 * 1024; +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; const unsigned int kIvfFileHdrSize = 32; const unsigned int kIvfFrameHdrSize = 12; diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 5a2ade1ef..1fe0348fc 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -32,7 +33,6 @@ typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, int cols, int flimit); namespace { - // Compute the filter level used in post proc from the loop filter strength int q2mbl(int x) { if (x < 20) x = 20; @@ -42,18 +42,36 @@ int q2mbl(int x) { } class VpxPostProcDownAndAcrossMbRowTest - : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> { + : public AbstractBench, + public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> { public: + VpxPostProcDownAndAcrossMbRowTest() : mbPostProcDownAndAcross(GetParam()) {} virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + const VpxPostProcDownAndAcrossMbRowFunc mbPostProcDownAndAcross; + // Size of the underlying data block that will be filtered. + int block_width; + int block_height; + Buffer<uint8_t> *src_image; + Buffer<uint8_t> *dst_image; + uint8_t *flimits; + void run(); }; +void VpxPostProcDownAndAcrossMbRowTest::run() { + mbPostProcDownAndAcross(src_image->TopLeftPixel(), dst_image->TopLeftPixel(), + src_image->stride(), dst_image->stride(), block_width, + flimits, 16); +} + // Test routine for the VPx post-processing function // vpx_post_proc_down_and_across_mb_row_c. TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Size of the underlying data block that will be filtered. - const int block_width = 16; - const int block_height = 16; + block_width = 16; + block_height = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2); @@ -66,8 +84,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8); ASSERT_TRUE(dst_image.Init()); - uint8_t *const flimits = - reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); (void)memset(flimits, 255, block_width); // Initialize pixels in the input: @@ -79,13 +96,12 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Initialize pixels in the output to 99. 
dst_image.Set(99); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross( src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), dst_image.stride(), block_width, flimits, 16)); - static const uint8_t kExpectedOutput[block_height] = { - 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4 - }; + static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 3, 4 }; uint8_t *pixel_ptr = dst_image.TopLeftPixel(); for (int i = 0; i < block_height; ++i) { @@ -103,8 +119,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // Size of the underlying data block that will be filtered. // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V // blocks are always a multiple of 8 wide and exactly 8 high. - const int block_width = 136; - const int block_height = 16; + block_width = 136; + block_height = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. @@ -127,8 +143,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so // it must be padded out. const int flimits_width = block_width % 16 ? block_width + 8 : block_width; - uint8_t *const flimits = - reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width)); + flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width)); ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); @@ -143,7 +158,6 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { for (int f = 0; f < 255; f++) { (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); - dst_image.Set(0); dst_image_ref.Set(0); @@ -151,10 +165,10 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), src_image.stride(), dst_image_ref.stride(), block_width, flimits, block_height); - ASM_REGISTER_STATE_CHECK( - GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(), - src_image.stride(), dst_image.stride(), block_width, - flimits, block_height)); + ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width, flimits, + block_height)); ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); } @@ -163,12 +177,58 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { vpx_free(flimits); } +TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { + // Size of the underlying data block that will be filtered. + block_width = 16; + block_height = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2); + ASSERT_TRUE(src_image.Init()); + this->src_image = &src_image; + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer<uint8_t> dst_image = + Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); + this->dst_image = &dst_image; + + flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + (void)memset(flimits, 255, block_width); + + // Initialize pixels in the input: + // block pixels to value 1, + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(1); + + // Initialize pixels in the output to 99. 
+ dst_image.Set(99); + + runNTimes(INT16_MAX); + printMedian("16x16"); + + vpx_free(flimits); +}; + class VpxMbPostProcAcrossIpTest - : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> { + : public AbstractBench, + public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> { public: + VpxMbPostProcAcrossIpTest() + : rows(16), cols(16), mbPostProcAcrossIp(GetParam()), + src(Buffer<uint8_t>(rows, cols, 8, 8, 17, 8)) {} virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: + const int rows; + const int cols; + const VpxMbPostProcAcrossIpFunc mbPostProcAcrossIp; + Buffer<uint8_t> src; + void run(); + void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { @@ -197,11 +257,11 @@ class VpxMbPostProcAcrossIpTest } }; -TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; +void VpxMbPostProcAcrossIpTest::run() { + mbPostProcAcrossIp(src.TopLeftPixel(), src.stride(), rows, cols, q2mbl(0)); +} - Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); +TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); @@ -215,15 +275,11 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { } TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 }; @@ -232,15 +288,11 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { } TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 }; @@ -254,9 +306,6 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { } TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); ASSERT_TRUE(c_mem.Init()); Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); @@ -279,12 +328,33 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { } } +TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) { + ASSERT_TRUE(src.Init()); + src.SetPadding(10); + + SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + + runNTimes(100000); + printMedian("16x16"); +} + class VpxMbPostProcDownTest - : public ::testing::TestWithParam<VpxMbPostProcDownFunc> { + : public AbstractBench, + public ::testing::TestWithParam<VpxMbPostProcDownFunc> { public: + VpxMbPostProcDownTest() + : rows(16), cols(16), mbPostProcDown(GetParam()), + src_c(Buffer<uint8_t>(rows, cols, 8, 8, 8, 17)) {} + virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: + const int rows; + const int cols; + const VpxMbPostProcDownFunc mbPostProcDown; + Buffer<uint8_t> src_c; + void run(); + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < 
rows; r++) { memset(src_c, r, cols); @@ -306,22 +376,22 @@ class VpxMbPostProcDownTest void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, int filter_level, const unsigned char *expected_output) { ASM_REGISTER_STATE_CHECK( - GetParam()(s, src_width, rows, cols, filter_level)); + mbPostProcDown(s, src_width, rows, cols, filter_level)); RunComparison(expected_output, s, rows, cols, src_width); } }; -TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; +void VpxMbPostProcDownTest::run() { + mbPostProcDown(src_c.TopLeftPixel(), src_c.stride(), rows, cols, q2mbl(0)); +} - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); +TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, @@ -348,16 +418,12 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { } TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -379,10 +445,6 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { } TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); @@ -399,13 +461,9 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); - Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_c.Init()); Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); ASSERT_TRUE(src_asm.Init()); @@ -418,7 +476,7 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mbPostProcDown( src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); ASSERT_TRUE(src_asm.CheckValues(src_c)); @@ -429,12 +487,22 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mbPostProcDown( src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); ASSERT_TRUE(src_asm.CheckValues(src_c)); } } +TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { + ASSERT_TRUE(src_c.Init()); + src_c.SetPadding(10); + + SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + + runNTimes(100000); + printMedian("16x16"); +} + INSTANTIATE_TEST_CASE_P( C, VpxPostProcDownAndAcrossMbRowTest, 
::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); @@ -481,4 +549,16 @@ INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, ::testing::Values(vpx_mbpost_proc_down_msa)); #endif // HAVE_MSA +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); +#endif // HAVE_VSX + } // namespace diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index be3a1969c..b9fbd8f4f 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -115,6 +115,8 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { key_frame_spacing_ = 9999; num_nonref_frames_ = 0; layer_framedrop_ = 0; + force_key_ = 0; + force_key_test_ = 0; } virtual void BeginPassHook(unsigned int /*pass*/) {} @@ -203,6 +205,7 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { svc_drop_frame.framedrop_mode = LAYER_DROP; for (i = 0; i < number_spatial_layers_; i++) svc_drop_frame.framedrop_thresh[i] = 30; + svc_drop_frame.max_consec_drop = 30; encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); } } @@ -268,7 +271,7 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { } if (dynamic_drop_layer_) { - if (video->frame() == 50) { + if (video->frame() == 0) { // Change layer bitrates to set top layers to 0. This will trigger skip // encoding/dropping of top two spatial layers. cfg_.rc_target_bitrate -= @@ -278,7 +281,25 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { cfg_.layer_target_bitrate[1] = 0; cfg_.layer_target_bitrate[2] = 0; encoder->Config(&cfg_); + } else if (video->frame() == 50) { + // Change layer bitrates to non-zero on two top spatial layers. + // This will trigger skip encoding of top two spatial layers. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += + cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); } else if (video->frame() == 100) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 150) { // Change layer bitrate on second layer to non-zero to start // encoding it again. 
cfg_.layer_target_bitrate[1] = middle_bitrate_; @@ -292,12 +313,21 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { encoder->Config(&cfg_); } } + + if (force_key_test_ && force_key_) + frame_flags_ = VPX_EFLAG_FORCE_KF; + else + frame_flags_ = 0; + const vpx_rational_t tb = video->timebase(); timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; } - virtual void PostEncodeFrameHook() { + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; for (int sl = 0; sl < number_spatial_layers_; ++sl) { for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { const int layer = sl * number_temporal_layers_ + tl; @@ -366,13 +396,19 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { // In the constrained frame drop mode, if a given spatial is dropped all // upper layers must be dropped too. if (!layer_framedrop_) { + int num_layers_dropped = 0; for (int sl = 0; sl < number_spatial_layers_; ++sl) { if (!pkt->data.frame.spatial_layer_encoded[sl]) { // Check that all upper layers are dropped. + num_layers_dropped++; for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2) ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0); } } + if (num_layers_dropped == number_spatial_layers_ - 1) + force_key_ = 1; + else + force_key_ = 0; } // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, @@ -461,6 +497,8 @@ class DatarateOnePassCbrSvc : public ::libvpx_test::EncoderTest { int key_frame_spacing_; unsigned int num_nonref_frames_; int layer_framedrop_; + int force_key_; + int force_key_test_; }; // Params: speed setting. 
@@ -528,6 +566,53 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) { } // Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers, with force key frame after frame drop +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 3; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 100; + ResetModel(); + AssignLayerBitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(num_nonref_frames_, GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and // 3 temporal layers. Run CIF clip with 1 thread. TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL) { cfg_.rc_buf_initial_sz = 500; @@ -711,9 +796,9 @@ TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) { cfg_.ts_number_layers, cfg_.temporal_layering_mode, layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - // Don't check rate targeting on top spatial layer since it will be skipped - // for part of the sequence. - CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 1, + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(&cfg_, number_spatial_layers_ - 2, number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // The non-reference frames are expected to be mismatched frames as the @@ -848,7 +933,7 @@ TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) { layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.75, 1.2); + number_temporal_layers_, file_datarate_, 0.75, 1.45); #if CONFIG_VP9_DECODER // The non-reference frames are expected to be mismatched frames as the // encoder will avoid loopfilter on these frames. 
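/*
 * Aside (not part of the patch): a minimal sketch of how an application wires
 * up the frame-drop control exercised by the SVC tests above, mirroring the
 * vp9_spatial_svc_encoder.c hunk earlier in this diff. The codec context,
 * enc_cfg and spatial_layers variables are assumed to be initialized already.
 */
vpx_svc_frame_drop_t svc_drop_frame;
svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP;  /* or LAYER_DROP */
for (int sl = 0; sl < spatial_layers; ++sl)
  svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh;
svc_drop_frame.max_consec_drop = INT_MAX;  /* no cap on consecutive drops */
vpx_codec_control(&codec, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame);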
@@ -1147,20 +1232,21 @@ TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) { } VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSingleBR, - ::testing::Range(5, 9)); + ::testing::Range(5, 10)); -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 9), +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 10), ::testing::Range(0, 3)); VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcFrameDropMultiBR, - ::testing::Range(5, 9), ::testing::Range(0, 2), + ::testing::Range(5, 10), ::testing::Range(0, 2), ::testing::Range(0, 3)); #if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser, ::testing::Range(5, 9), - ::testing::Range(1, 3), ::testing::Range(0, 3)); +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser, + ::testing::Range(5, 10), ::testing::Range(1, 3), + ::testing::Range(0, 3)); #endif -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 9), +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 10), ::testing::Range(32, 36)); } // namespace diff --git a/test/test-data.mk b/test/test-data.mk index 7ca11bc9c..4be6c66ff 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -738,6 +738,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 3a23ff5db..9cb9d5864 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -856,3 +856,5 @@ fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res 90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 +a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res diff --git a/test/test.mk b/test/test.mk index 3e5739e21..224ac4e8f 100644 --- a/test/test.mk +++ b/test/test.mk @@ -1,4 +1,6 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += bench.h +LIBVPX_TEST_SRCS-yes += bench.cc LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h diff --git a/test/variance_test.cc b/test/variance_test.cc index 725821ae6..fce7a1475 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1533,6 +1533,27 @@ INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_CASE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_CASE_P( + VSX, 
VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), + VarianceParams(6, 5, &vpx_variance64x32_vsx), + VarianceParams(5, 6, &vpx_variance32x64_vsx), + VarianceParams(5, 5, &vpx_variance32x32_vsx), + VarianceParams(5, 4, &vpx_variance32x16_vsx), + VarianceParams(4, 5, &vpx_variance16x32_vsx), + VarianceParams(4, 4, &vpx_variance16x16_vsx), + VarianceParams(4, 3, &vpx_variance16x8_vsx), + VarianceParams(3, 4, &vpx_variance8x16_vsx), + VarianceParams(3, 3, &vpx_variance8x8_vsx), + VarianceParams(3, 2, &vpx_variance8x4_vsx), + VarianceParams(2, 3, &vpx_variance4x8_vsx), + VarianceParams(2, 2, &vpx_variance4x4_vsx))); #endif // HAVE_VSX #if HAVE_MMI diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index c4dbcacbe..a8bcc2a43 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -266,7 +266,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36) << " The datarate for the file is greater than target by too much!"; } @@ -294,7 +294,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) << " The datarate for the file is greater than target by too much!"; } @@ -824,16 +824,17 @@ TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(2, 9), ::testing::Range(0, 4)); + ::testing::Range(2, 10), ::testing::Range(0, 4)); VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeOneBR, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(2, 9)); + ::testing::Range(2, 10)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); #if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, + ::testing::Range(5, 10)); #endif } // namespace diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 6b7e51211..44659904f 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -409,7 +409,7 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(3, 9), // cpu_used + ::testing::Range(3, 10), // cpu_used ::testing::Range(0, 3), // tile_columns ::testing::Range(2, 5))); // threads diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index f0bbedbfa..c39267faa 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -18,6 +18,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -67,10 +68,13 @@ void QuantFPWrapper(const tran_low_t *coeff, 
intptr_t count, int skip_block, scan, iscan); } -class VP9QuantizeBase { +class VP9QuantizeBase : public AbstractBench { public: VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) - : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) { + : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp), + coeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 16)), + qcoeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)), + dqcoeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)) { max_value_ = (1 << bit_depth_) - 1; zbin_ptr_ = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); @@ -86,6 +90,9 @@ class VP9QuantizeBase { vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); + + r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; } ~VP9QuantizeBase() { @@ -118,6 +125,15 @@ class VP9QuantizeBase { int max_value_; const int max_size_; const bool is_fp_; + Buffer<tran_low_t> coeff; + Buffer<tran_low_t> qcoeff; + Buffer<tran_low_t> dqcoeff; + int16_t *r_ptr; + int16_t *q_ptr; + int count; + int skip_block; + const scan_order *scan; + uint16_t eob; }; class VP9QuantizeTest : public VP9QuantizeBase, @@ -128,10 +144,17 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: + void run(); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; +void VP9QuantizeTest::run() { + quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan->scan, scan->iscan); +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). @@ -269,11 +292,8 @@ void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(qcoeff.Init()); - Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(dqcoeff.Init()); Buffer<tran_low_t> ref_qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); @@ -281,7 +301,8 @@ TEST_P(VP9QuantizeTest, OperationCheck) { Buffer<tran_low_t> ref_dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_dqcoeff.Init()); - uint16_t eob, ref_eob; + uint16_t ref_eob = 0; + eob = 0; for (int i = 0; i < number_of_iterations; ++i) { // Test skip block for the first three iterations to catch all the different @@ -294,23 +315,21 @@ TEST_P(VP9QuantizeTest, OperationCheck) { sz = TX_32X32; } const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); + scan = &vp9_scan_orders[sz][tx_type]; + count = (4 << sz) * (4 << sz); coeff.Set(&rnd, -max_value_, max_value_); GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_order->scan, scan_order->iscan); + scan->scan, scan->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), - dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + dequant_ptr_, &eob, scan->scan, scan->iscan)); EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); @@ -328,11 +347,8 @@ TEST_P(VP9QuantizeTest, OperationCheck) { TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(qcoeff.Init()); - Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(dqcoeff.Init()); Buffer<tran_low_t> ref_qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); @@ -340,10 +356,12 @@ TEST_P(VP9QuantizeTest, EOBCheck) { Buffer<tran_low_t> ref_dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(ref_dqcoeff.Init()); - uint16_t eob, ref_eob; + uint16_t ref_eob = 0; + eob = 0; + const uint32_t max_index = max_size_ * max_size_ - 1; for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = 0; + skip_block = 0; TX_SIZE sz; if (max_size_ == 16) { sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16 @@ -351,28 +369,26 @@ TEST_P(VP9QuantizeTest, EOBCheck) { sz = TX_32X32; } const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); + scan = &vp9_scan_orders[sz][tx_type]; + count = (4 << sz) * (4 << sz); // Two random entries coeff.Set(0); - coeff.TopLeftPixel()[rnd(count)] = + coeff.TopLeftPixel()[rnd.RandRange(count) & max_index] = static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; - coeff.TopLeftPixel()[rnd(count)] = + coeff.TopLeftPixel()[rnd.RandRange(count) & max_index] = static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_order->scan, scan_order->iscan); + scan->scan, scan->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), - dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + dequant_ptr_, &eob, scan->scan, scan->iscan)); EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); @@ -390,13 +406,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(qcoeff.Init()); - Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); ASSERT_TRUE(dqcoeff.Init()); - uint16_t eob; TX_SIZE starting_sz, ending_sz; if (max_size_ == 16) { @@ -410,18 +422,16 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { // zbin > coeff, zbin < coeff. for (int i = 0; i < 2; ++i) { - const int skip_block = 0; + skip_block = 0; // TX_TYPE defines the scan order. That is not relevant to the speed test. // Pick the first one. const TX_TYPE tx_type = DCT_DCT; - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); + count = (4 << sz) * (4 << sz); + scan = &vp9_scan_orders[sz][tx_type]; GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; - int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; if (i == 0) { // When |coeff values| are less than zbin the results are 0. @@ -438,22 +448,15 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) { coeff.Set(&rnd, -500, 500); } - vpx_usec_timer timer; - vpx_usec_timer_start(&timer); - for (int j = 0; j < 100000000 / count; ++j) { - quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, - q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), - dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, - scan_order->scan, scan_order->iscan); - } - vpx_usec_timer_mark(&timer); - const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); - if (i == 0) printf("Bypass calculations.\n"); - if (i == 1) printf("Full calculations.\n"); - printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, - elapsed_time / 1000); + runNTimes(10000000 / count); + const char *type = + (i == 0) ? 
"Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + printMedian(title); } - printf("\n"); } } @@ -557,6 +560,16 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8, 32, true))); #endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_vsx, + &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH + // Only useful to compare "Speed" test results. INSTANTIATE_TEST_CASE_P( DISABLED_C, VP9QuantizeTest, @@ -575,10 +588,3 @@ INSTANTIATE_TEST_CASE_P( &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, true))); } // namespace - -#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_vsx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false))); -#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 8149a083f..b46ba101d 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -3,7 +3,7 @@ LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE:= libwebm LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat +LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11 LOCAL_C_INCLUDES:= $(LOCAL_PATH) LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH) diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index ebb5ff2f4..6d8b0b4cc 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 +Version: af81f26025b7435fa9a14ad07c58b44cf9280430 License: BSD License File: LICENSE.txt @@ -7,4 +7,14 @@ Description: libwebm is used to handle WebM container I/O. Local Changes: -* <none> +Only keep: + - Android.mk + - AUTHORS.TXT + - common/ + file_util.cc/h + hdr_util.cc/h + webmids.h + - LICENSE.TXT + - mkvmuxer/ + - mkvparser/ + - PATENTS.TXT diff --git a/third_party/libwebm/common/file_util.cc b/third_party/libwebm/common/file_util.cc index 6dab146dd..618ffc087 100644 --- a/third_party/libwebm/common/file_util.cc +++ b/third_party/libwebm/common/file_util.cc @@ -17,6 +17,7 @@ #include <cstring> #include <fstream> #include <ios> +#include <string> namespace libwebm { @@ -41,7 +42,12 @@ std::string GetTempFileName() { return temp_file_name; #else char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + errno_t err = (fname_pointer == &tmp_file_name[0]) ? 
0 : -1; +#endif if (err == 0) { return std::string(tmp_file_name); } @@ -65,6 +71,15 @@ uint64_t GetFileSize(const std::string& file_name) { return file_size; } +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast<size_t>(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } TempFileDeleter::~TempFileDeleter() { diff --git a/third_party/libwebm/common/file_util.h b/third_party/libwebm/common/file_util.h index 0e71eac11..a87373464 100644 --- a/third_party/libwebm/common/file_util.h +++ b/third_party/libwebm/common/file_util.h @@ -22,6 +22,9 @@ std::string GetTempFileName(); // Returns size of file specified by |file_name|, or 0 upon failure. uint64_t GetFileSize(const std::string& file_name); +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + // Manages life of temporary file specified at time of construction. Deletes // file upon destruction. class TempFileDeleter { @@ -38,4 +41,4 @@ class TempFileDeleter { } // namespace libwebm -#endif // LIBWEBM_COMMON_FILE_UTIL_H_
\ No newline at end of file +#endif // LIBWEBM_COMMON_FILE_UTIL_H_ diff --git a/third_party/libwebm/common/hdr_util.cc b/third_party/libwebm/common/hdr_util.cc index e1618ce75..916f7170b 100644 --- a/third_party/libwebm/common/hdr_util.cc +++ b/third_party/libwebm/common/hdr_util.cc @@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, if (MasteringMetadataValuePresent(parser_mm.luminance_min)) muxer_mm->set_luminance_min(parser_mm.luminance_min); - PrimaryChromaticityPtr r_ptr(NULL); - PrimaryChromaticityPtr g_ptr(NULL); - PrimaryChromaticityPtr b_ptr(NULL); - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); + PrimaryChromaticityPtr g_ptr(nullptr); + PrimaryChromaticityPtr b_ptr(nullptr); + PrimaryChromaticityPtr wp_ptr(nullptr); if (parser_mm.r) { if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr)) diff --git a/third_party/libwebm/common/hdr_util.h b/third_party/libwebm/common/hdr_util.h index 3ef5388fd..78e2eeb70 100644 --- a/third_party/libwebm/common/hdr_util.h +++ b/third_party/libwebm/common/hdr_util.h @@ -47,15 +47,7 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif -typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic pop -#endif +typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 15b9a908d..481771db2 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvmuxer.h" +#include <stdint.h> + #include <cfloat> #include <climits> #include <cstdio> @@ -24,11 +26,6 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvmuxer { const float PrimaryChromaticity::kChromaticityMin = 0.0f; @@ -72,7 +69,7 @@ bool StrCpy(const char* src, char** dst_ptr) { return true; } -typedef std::auto_ptr<PrimaryChromaticity> PrimaryChromaticityPtr; +typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyChromaticity(const PrimaryChromaticity* src, PrimaryChromaticityPtr* dst) { if (!dst) @@ -1057,22 +1054,22 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { bool MasteringMetadata::SetChromaticity( const PrimaryChromaticity* r, const PrimaryChromaticity* g, const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { - PrimaryChromaticityPtr r_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); if (r) { if (!CopyChromaticity(r, &r_ptr)) return false; } - PrimaryChromaticityPtr g_ptr(NULL); + PrimaryChromaticityPtr g_ptr(nullptr); if (g) { if (!CopyChromaticity(g, &g_ptr)) return false; } - PrimaryChromaticityPtr b_ptr(NULL); + PrimaryChromaticityPtr b_ptr(nullptr); if (b) { if (!CopyChromaticity(b, &b_ptr)) return false; } - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr wp_ptr(nullptr); if (white_point) { if (!CopyChromaticity(white_point, &wp_ptr)) return false; @@ -1238,7 +1235,7 @@ bool 
Colour::Write(IMkvWriter* writer) const { } bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { - std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); + std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -1546,7 +1543,7 @@ bool VideoTrack::Write(IMkvWriter* writer) const { } bool VideoTrack::SetColour(const Colour& colour) { - std::auto_ptr<Colour> colour_ptr(new Colour()); + std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -1574,7 +1571,7 @@ bool VideoTrack::SetColour(const Colour& colour) { } bool VideoTrack::SetProjection(const Projection& projection) { - std::auto_ptr<Projection> projection_ptr(new Projection()); + std::unique_ptr<Projection> projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -2666,7 +2663,7 @@ bool Cluster::QueueOrWriteFrame(const Frame* const frame) { // and write it if it is okay to do so (i.e.) no other track has an held back // frame with timestamp <= the timestamp of the frame in question. std::vector<std::list<Frame*>::iterator> frames_to_erase; - for (std::list<Frame *>::iterator + for (std::list<Frame*>::iterator current_track_iterator = stored_frames_[track_number].begin(), end = --stored_frames_[track_number].end(); current_track_iterator != end; ++current_track_iterator) { diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index 37f230d0a..e7b76f7da 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,12 +22,8 @@ #include "common/webmids.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvparser { +const long long kStringElementSizeLimit = 20 * 1000 * 1000; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; const float Projection::kValueNotPresent = FLT_MAX; @@ -330,7 +326,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size, delete[] str; str = NULL; - if (size >= LONG_MAX || size < 0) + if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit) return E_FILE_FORMAT_INVALID; // +1 for '\0' terminator @@ -5015,7 +5011,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, if (!reader || *mm) return false; - std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); + std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -5035,6 +5031,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_max = static_cast<float>(value); if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 || mm_ptr->luminance_max > 9999.99) { @@ -5044,6 +5044,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_min = static_cast<float>(value); if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 || mm_ptr->luminance_min > 
999.9999) { @@ -5096,7 +5100,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, if (!reader || *colour) return false; - std::auto_ptr<Colour> colour_ptr(new Colour()); + std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -5194,7 +5198,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, if (!reader || *projection) return false; - std::auto_ptr<Projection> projection_ptr(new Projection()); + std::unique_ptr<Projection> projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -7903,6 +7907,10 @@ long Block::Parse(const Cluster* pCluster) { return E_FILE_FORMAT_INVALID; curr.len = static_cast<long>(frame_size); + // Check if size + curr.len could overflow. + if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } size += curr.len; // contribution of this frame --frame_count; @@ -7964,6 +7972,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const { const long long tc0 = pCluster->GetTimeCode(); assert(tc0 >= 0); + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + const long long tc = tc0 + m_timecode; return tc; // unscaled timecode units @@ -7981,6 +7994,10 @@ long long Block::GetTime(const Cluster* pCluster) const { const long long scale = pInfo->GetTimeCodeScale(); assert(scale >= 1); + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } const long long ns = tc * scale; return ns; diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c index 5e8ca02b4..6c1d784d3 100644 --- a/tools/tiny_ssim.c +++ b/tools/tiny_ssim.c @@ -91,6 +91,7 @@ typedef struct input_file { int w; int h; int bit_depth; + int frame_size; } input_file_t; // Open a file and determine if its y4m or raw. If y4m get the header. @@ -119,10 +120,12 @@ static int open_input_file(const char *file_name, input_file_t *input, int w, fseek(input->file, 0, SEEK_SET); input->w = w; input->h = h; - if (bit_depth < 9) - input->buf = malloc(w * h * 3 / 2); - else - input->buf = malloc(w * h * 3); + // handle odd frame sizes + input->frame_size = w * h + ((w + 1) / 2 * (h + 1) / 2) * 2; + if (bit_depth > 8) { + input->frame_size *= 2; + input->buf = malloc(input->frame_size); + } break; } } @@ -150,15 +153,15 @@ static size_t read_input_file(input_file_t *in, unsigned char **y, break; case RAW_YUV: if (bd < 9) { - r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; *u = in->buf + in->w * in->h; - *v = in->buf + 5 * in->w * in->h / 4; + *v = *u + (1 + in->w) / 2 * (1 + in->h) / 2; } else { - r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; - *u = in->buf + in->w * in->h / 2; - *v = *u + in->w * in->h / 2; + *u = in->buf + (in->w * in->h) * 2; + *v = *u + 2 * ((1 + in->w) / 2 * (1 + in->h) / 2); } break; } @@ -325,7 +328,8 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, // (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) // // Replace c1 with n*n * c1 for the final step that leads to this code: -// The final step scales by 12 bits so we don't lose precision in the constants. +// The final step scales by 12 bits so we don't lose precision in the +// constants. static double ssimv_similarity(const Ssimv *sv, int64_t n) { // Scale the constants by number of pixels. 
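As an aside on the tiny_ssim.c changes above: the new frame_size expression rounds each chroma plane up independently, which is why the odd-width/odd-height rejection can be dropped further down. A minimal standalone sketch of the same arithmetic (the function name is illustrative, not part of the tool):

#include <stdio.h>

/* Bytes per I420 frame with chroma planes rounded up for odd luma
 * dimensions, mirroring the frame_size computation added to tiny_ssim.c. */
static int i420_frame_size(int w, int h, int bit_depth) {
  const int chroma_w = (w + 1) / 2;
  const int chroma_h = (h + 1) / 2;
  int size = w * h + chroma_w * chroma_h * 2; /* Y plus two chroma planes */
  if (bit_depth > 8) size *= 2;               /* two bytes per sample */
  return size;
}

int main(void) {
  /* 99x75 luma: chroma planes are 50x38, so 7425 + 2 * 1900 = 11225. */
  printf("8-bit  99x75: %d bytes\n", i420_frame_size(99, 75, 8));
  printf("10-bit 99x75: %d bytes\n", i420_frame_size(99, 75, 10));
  return 0;
}

The same (w + 1) / 2 and (h + 1) / 2 rounding is what the chroma-plane psnr_and_ssim calls switch to in the hunks that follow.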
@@ -628,9 +632,10 @@ int main(int argc, char *argv[]) { goto clean_up; } - // Number of frames to skip from file1.yuv for every frame used. Normal values - // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding - // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + // Number of frames to skip from file1.yuv for every frame used. Normal + // values 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL + // encoding in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer + // encoding. if (argc > 4) { sscanf(argv[4], "%d", &tl_skip); if (argc > 5) { @@ -644,12 +649,6 @@ int main(int argc, char *argv[]) { } } - if (w & 1 || h & 1) { - fprintf(stderr, "Invalid size %dx%d\n", w, h); - return_value = 1; - goto clean_up; - } - while (1) { size_t r1, r2; unsigned char *y[2], *u[2], *v[2]; @@ -703,8 +702,10 @@ int main(int argc, char *argv[]) { psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); } psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); - psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], w / 2, h / 2); - psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], w / 2, h / 2); + psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2, + (h + 1) / 2); + psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], (w + 1) / 2, + (h + 1) / 2); n_frames++; } diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index d67ee8a57..8c292d616 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -65,7 +65,7 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mbr, mbc; /* The pixel thresholds are adjusted according to if or not the macroblock diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 8bfd3cea3..0d54a9442 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -686,6 +686,12 @@ static unsigned int read_available_partition_size( const unsigned char *partition_size_ptr = token_part_sizes + i * 3; unsigned int partition_size = 0; ptrdiff_t bytes_left = fragment_end - fragment_start; + if (bytes_left < 0) { + vpx_internal_error( + &pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition. No bytes left %d.", + (int)bytes_left); + } /* Calculate the length of this partition. The last partition * size is implicit. If the partition size can't be read, then * either use the remaining data in the buffer (for EC mode) diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index b47840795..3b4fee7cf 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -428,7 +428,9 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { double weight_segment = 0; int thresh_low_motion = (cm->width < 720) ? 55 : 20; cr->apply_cyclic_refresh = 1; - if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + // TODO(jianj): Look into issue of cyclic refresh with high bitdepth. 
+ if (cm->bit_depth > 8 || cm->frame_type == KEY_FRAME || + cpi->svc.temporal_layer_id > 0 || (cpi->use_svc && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && @@ -457,6 +459,15 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_boost_fac = 13; } } + // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and + // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2 + // (rate_boost_fac = 10 disables segment#2). + // TODO(marpan): Consider increasing refresh rate after slide change. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 2.0; + cr->rate_boost_fac = 10; + } // Adjust some parameters for low resolutions. if (cm->width <= 352 && cm->height <= 288) { if (rc->avg_frame_bandwidth < 3000) { @@ -587,3 +598,12 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; } + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } +} diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h index 77fa67c9e..f59f193f6 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -139,6 +139,8 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { return CR_SEGMENT_ID_BASE; } +void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index c96dc3fbd..0b3eef7b3 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -556,6 +556,7 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, } else { thresholds[1] = (5 * threshold_base) >> 1; } + if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX; } } @@ -4877,6 +4878,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; +#if CONFIG_CONSISTENT_RECODE + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; +#endif tile_data->mode_map[i][j] = j; } } @@ -5001,7 +5005,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; - +#if CONFIG_CONSISTENT_RECODE + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; +#endif if (xd->lossless) x->optimize = 0; cm->tx_mode = select_tx_mode(cpi, xd); @@ -5126,9 +5132,48 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } +#if CONFIG_CONSISTENT_RECODE +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = + tile_data->thresh_freq_fact_prev[i][j]; + } + } + } + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} +#endif + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_CONSISTENT_RECODE + restore_encode_params(cpi); +#endif + // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this @@ -5404,7 +5449,11 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); - if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0) + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); } } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3384de7ea..fca8f331d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3024,23 +3024,28 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; if (cm->frame_type == KEY_FRAME) { int i; - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; // On key frame update all reference frame slots. for (i = 0; i < REF_FRAMES; i++) { + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; // LAST/GOLDEN/ALTREF is already updated above. 
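Stepping back to the CONFIG_CONSISTENT_RECODE additions above: restore_encode_params() reloads, at the top of vp9_encode_frame(), the snapshot that save_encode_params() (added further on in vp9_encoder.c and called just before the recode loop) takes of the adaptive RD thresholds, so every recode of the same frame starts from identical state rather than from thresholds mutated by the previous attempt. A reduced sketch of the pattern (the struct and field are simplified stand-ins for RD_OPT and TileDataEnc, and the array size is arbitrary):

#include <string.h>

#define NUM_MODES 30 /* arbitrary size for the sketch */

typedef struct {
  int thresh_freq_fact[NUM_MODES];      /* adapted while encoding */
  int thresh_freq_fact_prev[NUM_MODES]; /* snapshot taken pre-encode */
} TileThresh;

/* Taken once per frame, before the first encode attempt. */
static void save_params(TileThresh *t) {
  memcpy(t->thresh_freq_fact_prev, t->thresh_freq_fact,
         sizeof(t->thresh_freq_fact));
}

/* Run at the start of every encode attempt (including recodes), so each
 * attempt sees the same starting thresholds. */
static void restore_params(TileThresh *t) {
  memcpy(t->thresh_freq_fact, t->thresh_freq_fact_prev,
         sizeof(t->thresh_freq_fact_prev));
}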
if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx) ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); } } else { - if (cpi->refresh_last_frame) - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - if (cpi->refresh_golden_frame) - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - if (cpi->refresh_alt_ref_frame) - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; + if (cpi->refresh_last_frame) { + svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_golden_frame) { + svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_alt_ref_frame) { + svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id; + } } // Copy flags from encoder to SVC struct. vp9_copy_flags_ref_update_idx(cpi); @@ -3574,8 +3579,41 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +#if CONFIG_CONSISTENT_RECODE +static void save_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact_prev[i][j] = + tile_data->thresh_freq_fact[i][j]; + } + } + } + } +} +#endif + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. const INTERP_FILTER filter_scaler = @@ -3686,12 +3724,23 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cm->show_frame && cpi->oxcf.mode == REALTIME && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || - (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) vp9_scene_detection_onepass(cpi); if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = cpi->rc.high_source_sad; + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, or if base layer is key for svc. + // Don't drop on scene change. + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME && !cpi->rc.high_source_sad && + !cpi->svc.high_source_sad_superframe && + (!cpi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. 
Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). @@ -3715,7 +3764,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } } - if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + // TODO(jianj): Look into issue of skin detection with high bitdepth. + if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { @@ -3729,10 +3779,12 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); - // For SVC on non-zero spatial layer: check for disabling inter-layer - // prediction. - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) - vp9_svc_constrain_inter_layer_pred(cpi); + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (cpi->svc.spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. @@ -3795,6 +3847,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // seen in the last encoder iteration. // update_base_skip_probs(cpi); vpx_clear_system_state(); + return 1; } #define MAX_QSTEP_ADJ 4 @@ -4485,11 +4538,21 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cpi->oxcf.target_bandwidth == 0) { cpi->svc.skip_enhancement_layer = 1; vp9_rc_postencode_update_drop_frame(cpi); - vp9_inc_frame_in_layer(cpi); cpi->ext_refresh_frame_flags_pending = 0; cpi->last_frame_dropped = 1; cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + if (cpi->svc.framedrop_mode == LAYER_DROP || + cpi->svc.drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } return; } @@ -4538,55 +4601,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } } - // For 1 pass CBR, check if we are dropping this frame. - // Never drop on key frame, or if base layer is key for svc. - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && - cm->frame_type != KEY_FRAME && - (!cpi->use_svc || - !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) { - int svc_prev_layer_dropped = 0; - // In the contrained framedrop mode for svc (framedrop_mode = - // CONSTRAINED_LAYER_DROP), if the previous spatial layer was dropped, drop - // the current spatial layer. 
- if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) - svc_prev_layer_dropped = 1; - if ((svc_prev_layer_dropped && - cpi->svc.framedrop_mode == CONSTRAINED_LAYER_DROP) || - vp9_rc_drop_frame(cpi)) { - vp9_rc_postencode_update_drop_frame(cpi); - cpi->ext_refresh_frame_flags_pending = 0; - cpi->last_frame_dropped = 1; - if (cpi->use_svc) { - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; - vp9_inc_frame_in_layer(cpi); - cpi->svc.skip_enhancement_layer = 1; - if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { - int i; - int all_layers_drop = 1; - for (i = 0; i < cpi->svc.spatial_layer_id; i++) { - if (cpi->svc.drop_spatial_layer[i] == 0) { - all_layers_drop = 0; - break; - } - } - if (all_layers_drop == 1) cpi->svc.skip_enhancement_layer = 0; - } - } - return; - } - } - vpx_clear_system_state(); #if CONFIG_INTERNAL_STATS memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif +#if CONFIG_CONSISTENT_RECODE + // Backup to ensure consistency between recodes + save_encode_params(cpi); +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest); + if (!encode_without_recode_loop(cpi, size, dest)) return; } else { encode_with_recode_loop(cpi, size, dest); } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 1e0ed70fb..f66c13046 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -282,6 +282,9 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; +#if CONFIG_CONSISTENT_RECODE + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; +#endif int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 453879fb8..c76dfd351 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -2201,8 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = - rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); else @@ -2441,9 +2440,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); // Monitor for static sections. - if ((rc->frames_since_key + i - 1) > 1) { - zero_motion_accumulator *= get_zero_motion_factor(cpi, &next_frame); - } + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2464,18 +2462,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - // Break at maximum of active_max_gf_interval unless almost totally static. - // - // Note that the addition of a test of rc->source_alt_ref_active is - // deliberate. The effect of this is that after a normal altref group even - // if the material is static there will be one normal length GF group - // before allowing longer GF groups. 
The reason for this is that in cases - // such as slide shows where slides are separated by a complex transition - // such as a fade, the arf group spanning the transition may not be coded - // at a very high quality and hence this frame (with its overlay) is a - // poor golden frame to use for an extended group. - if (((i >= active_max_gf_interval) && - ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || + if ( + // Break at active_max_gf_interval unless almost totally static. + ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && @@ -2495,8 +2484,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if ((zero_motion_accumulator < 0.995) && allow_alt_ref && - (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { + if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && + (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2523,11 +2512,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - rc->baseline_gf_interval = - ((twopass->kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) && - (i >= rc->frames_to_key)) - ? i - : (i - (is_key_frame || rc->source_alt_ref_pending)); + // Set the interval until the next gf. + rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -2774,7 +2760,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; - rc->frames_since_key = 0; // Reset the GF group data structures. vp9_zero(*gf_group); @@ -2919,22 +2904,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // The zero motion test here insures that if we mark a kf group as static - // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. - // It also allows for a larger boost on long static groups. - if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) { + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { double frame_boost; double zm_factor; // Monitor for static sections. - // First frame in kf group the second ref indicator is invalid. - if (i > 0) { - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - } else { - zero_motion_accumulator = - next_frame.pcnt_inter - next_frame.pcnt_motion; - } + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -2971,16 +2947,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Special case for static / slide show content but dont apply - // if the kf group is very short. 
- if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { - rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST); - } else { - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); - } + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index aa497e3da..000ecd779 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -120,12 +120,12 @@ typedef enum { typedef struct { unsigned char index; unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; - FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; - int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1]; + RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; + FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; + int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; } GF_GROUP; typedef struct { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1ba518af8..60d5c89b1 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -224,6 +224,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (rv && search_subpel) { int subpel_force_stop = cpi->sf.mv.subpel_force_stop; if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -1421,7 +1429,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const SVC *const svc = &cpi->svc; + SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; @@ -1495,7 +1503,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; MV_REFERENCE_FRAME best_second_ref_frame = NONE; - MV_REFERENCE_FRAME spatial_ref = GOLDEN_FRAME; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? 
RT_INTER_MODES_SVC : RT_INTER_MODES; @@ -1504,25 +1512,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int svc_mv_row = 0; int no_scaling = 0; unsigned int thresh_svc_skip_golden = 500; - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id - 1, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; } - if (cpi->svc.spatial_layer_id > 0 && - (cpi->svc.high_source_sad_superframe || no_scaling)) + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) thresh_svc_skip_golden = 0; // Lower the skip threshold if lower spatial layer is better quality relative // to current layer. - else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 && - cm->base_qindex > cpi->svc.lower_layer_qindex + 15) + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) thresh_svc_skip_golden = 100; // Increase skip threshold if lower spatial layer is lower quality relative // to current layer. - else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 && - cm->base_qindex < cpi->svc.lower_layer_qindex - 20) + else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) thresh_svc_skip_golden = 1000; init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1585,10 +1593,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { if (cpi->use_svc) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; } if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) @@ -1623,19 +1631,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. 
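One indexing detail that recurs throughout these pickmode and rate-control changes: LAYER_IDS_TO_IDX() flattens a (spatial, temporal) layer pair into an index into svc->layer_context[]. The macro body is not part of this diff, so the sketch below assumes the conventional layout of number_temporal_layers entries per spatial layer:

/* Assumed flattening: temporal layers are contiguous within a spatial layer. */
static int layer_ids_to_idx(int sl, int tl, int num_tl) {
  return sl * num_tl + tl;
}

Under that assumption, the call above for thresh_svc_skip_golden, LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, svc->number_temporal_layers), selects the layer context of the next lower spatial layer at the same temporal layer.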
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && - cpi->svc.spatial_layer_id > 0) { + svc->spatial_layer_id > 0) { if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; - spatial_ref = LAST_FRAME; + inter_layer_ref = LAST_FRAME; } } if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; if (vp9_is_scaled(sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; - spatial_ref = GOLDEN_FRAME; + inter_layer_ref = GOLDEN_FRAME; } } } @@ -1652,6 +1660,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1677,6 +1689,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). + skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, @@ -1692,9 +1708,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // an averaging filter for downsampling (phase = 8). If so, we will test // a nonzero motion mode on the spatial reference. // The nonzero motion is half pixel shifted to left and top (-4, -4). - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - svc_force_zero_mode[spatial_ref - 1] && - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8) { svc_mv_col = -4; svc_mv_row = -4; flag_svc_subpel = 1; @@ -1713,7 +1729,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int inter_mv_mode = 0; int skip_this_mv = 0; int comp_pred = 0; - int force_gf_mv = 0; + int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; second_ref_frame = NONE; @@ -1743,8 +1759,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) continue; - if (flag_svc_subpel && ref_frame == spatial_ref) { - force_gf_mv = 1; + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), // otherwise set NEWMV to (svc_mv_col, svc_mv_row). 
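A note on the (svc_mv_col, svc_mv_row) = (-4, -4) constant set here: VP9 motion vectors are expressed in eighth-pel units, so -4 is exactly the half-pixel shift up and to the left that matches the phase-8 averaging downsampling filter mentioned in the comment above. A one-line check of the conversion:

#include <stdio.h>

int main(void) {
  const int svc_mv_col = -4, svc_mv_row = -4; /* eighth-pel units */
  /* -4 / 8.0 = -0.5 pixel in each direction. */
  printf("offset = (%.1f, %.1f) pixels\n", svc_mv_row / 8.0, svc_mv_col / 8.0);
  return 0;
}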
if (this_mode == NEWMV) { @@ -1771,8 +1787,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, sse_zeromv_normalized < thresh_svc_skip_golden) continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { + frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1802,14 +1820,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (const_motion[ref_frame] && this_mode == NEARMV) continue; // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1823,7 +1840,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } @@ -1883,9 +1900,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (this_mode == NEWMV && !force_gf_mv) { + if (this_mode == NEWMV && !force_mv_inter_layer) { if (ref_frame > LAST_FRAME && !cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { int tmp_sad; @@ -1931,7 +1948,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, // for SVC encoding. - if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && frame_mv[NEWMV][ref_frame].as_mv.row == 0 && frame_mv[NEWMV][ref_frame].as_mv.col == 0) continue; @@ -2028,7 +2045,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && !force_gf_mv && + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; @@ -2254,12 +2271,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // layer is chosen as the reference. Always perform intra prediction if // LAST is the only reference, or is_key_frame is set, or on base // temporal layer. 
- if (cpi->svc.spatial_layer_id) { + if (svc->spatial_layer_id) { perform_intra_pred = - cpi->svc.temporal_layer_id == 0 || - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || - (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && svc_force_zero_mode[best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 599337f80..11547fb2e 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -273,6 +273,14 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { const VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + // On dropped frame, don't update buffer if its currently stable + // (above optimal level). This can cause issues when full superframe + // can drop (!= LAYER_DROP), since QP is adjusted downwards with buffer + // overflow, which can cause more frame drops. + if (cpi->svc.framedrop_mode != LAYER_DROP && encoded_frame_size == 0 && + rc->buffer_level > rc->optimal_buffer_level) + return; + // Non-viewable frames are a special case and are treated as pure overhead. if (!cm->show_frame) { rc->bits_off_target -= encoded_frame_size; @@ -390,7 +398,31 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; } -static int check_buffer(VP9_COMP *cpi, int drop_mark) { +static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level > drop_mark); + } else { + int i; + // For SVC in the FULL_SUPERFRAME_DROP): the condition on + // buffer (if its above threshold, so no drop) is checked on current and + // upper spatial layers. If any spatial layer is not above threshold then + // we return 0. + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + const int drop_mark_layer = + (int)(cpi->svc.framedrop_thresh[i] * lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } + return 1; + } +} + +static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { SVC *svc = &cpi->svc; if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) { RATE_CONTROL *const rc = &cpi->rc; @@ -398,8 +430,10 @@ static int check_buffer(VP9_COMP *cpi, int drop_mark) { } else { int i; // For SVC in the constrained framedrop mode (svc->framedrop_mode = - // CONSTRAINED_LAYER_DROP): the condition on buffer (to drop frame) is - // checked on current and upper spatial layers. + // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on + // buffer (if its below threshold, so drop frame) is checked on current + // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any + // spatial layer is <= threshold, then we return 1 (drop). 
for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, svc->number_temporal_layers); @@ -407,23 +441,42 @@ static int check_buffer(VP9_COMP *cpi, int drop_mark) { RATE_CONTROL *lrc = &lc->rc; const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * lrc->optimal_buffer_level / 100); - if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } } - return 1; + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) + return 0; + else + return 1; } } -int vp9_rc_drop_frame(VP9_COMP *cpi) { +static int drop_frame(VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + SVC *svc = &cpi->svc; int drop_frames_water_mark = oxcf->drop_frames_water_mark; - if (cpi->use_svc) - drop_frames_water_mark = - cpi->svc.framedrop_thresh[cpi->svc.spatial_layer_id]; - if (!drop_frames_water_mark) { + if (cpi->use_svc) { + // If we have dropped max_consec_drop frames, then we don't + // drop this spatial layer, and reset counter to 0. + if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) { + svc->drop_count[svc->spatial_layer_id] = 0; + return 0; + } else { + drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id]; + } + } + if (!drop_frames_water_mark || + (svc->spatial_layer_id > 0 && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { return 0; } else { - if (rc->buffer_level < 0) { + if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) || + (check_buffer_below_thresh(cpi, -1) && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { // Always drop if buffer is below 0. return 1; } else { @@ -431,9 +484,11 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { // (starting with the next frame) until it increases back over drop_mark. int drop_mark = (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100); - if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + if (check_buffer_above_thresh(cpi, drop_mark) && + (rc->decimation_factor > 0)) { --rc->decimation_factor; - } else if (check_buffer(cpi, drop_mark) && rc->decimation_factor == 0) { + } else if (check_buffer_below_thresh(cpi, drop_mark) && + rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -452,6 +507,75 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { } } +int vp9_rc_drop_frame(VP9_COMP *cpi) { + SVC *svc = &cpi->svc; + int svc_prev_layer_dropped = 0; + // In the constrained or full_superframe framedrop mode for svc + // (framedrop_mode != LAYER_DROP), if the previous spatial layer was + // dropped, drop the current spatial layer. 
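To make the new per-layer drop accounting concrete: each spatial layer keeps a consecutive-drop counter, and once it reaches max_consec_drop the frame is encoded regardless of buffer state and the counter resets, so a stressed layer cannot be dropped indefinitely. A toy driver of that rule (the drop requests are fabricated; only the counter behaviour is the point):

#include <stdio.h>

int main(void) {
  const int max_consec_drop = 3;
  int drop_count = 0;
  int frame;
  for (frame = 0; frame < 8; ++frame) {
    int want_drop = 1; /* pretend rate control asks to drop every frame */
    if (drop_count == max_consec_drop) {
      drop_count = 0; /* cap reached: force this frame to be encoded */
      want_drop = 0;
    }
    if (want_drop) ++drop_count;
    printf("frame %d: %s\n", frame, want_drop ? "dropped" : "encoded");
  }
  return 0;
}

The default set in vp9_init_layer_context() further down is INT_MAX, which leaves the cap effectively disabled until an application lowers it.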
+ if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) + svc_prev_layer_dropped = 1; + if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP) || + drop_frame(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + if (cpi->use_svc) { + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + if (svc->framedrop_mode == LAYER_DROP || + svc->drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + int all_layers_drop = 1; + for (i = 0; i < svc->spatial_layer_id; i++) { + if (svc->drop_spatial_layer[i] == 0) { + all_layers_drop = 0; + break; + } + } + if (all_layers_drop == 1) svc->skip_enhancement_layer = 0; + } + } + return 1; + } + return 0; +} + +static int adjust_q_cbr(const VP9_COMP *cpi, int q) { + // This makes sure q is between oscillating Qs to prevent resonance. + if (!cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + // If the previous frame had overshoot and the current q needs to increase + // above the clamped value, reduce the clamp for faster reaction to + // overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + vp9_cyclic_refresh_limit_q(cpi, &q); + return q; +} + static double get_rate_correction_factor(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; double rcf; @@ -610,22 +734,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, } } while (++i <= active_worst_quality); - // In CBR mode, this makes sure q is between oscillating Qs to prevent - // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && - (!cpi->oxcf.gf_cbr_boost_pct || - !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && - (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && - cpi->rc.q_1_frame != cpi->rc.q_2_frame) { - int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), - VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); - // If the previous had overshoot and the current q needs to increase above - // the clamped value, reduce the clamp for faster reaction to overshoot. - if (cpi->rc.rc_1_frame == -1 && q > qclamp) - q = (q + qclamp) >> 1; - else - q = qclamp; - } + // Adjustment to q for CBR mode. 
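The adjust_q_cbr() helper factored out above keeps CBR Q from ping-ponging: when the rate-correction direction flipped sign between the last two frames (rc_1_frame * rc_2_frame == -1), the new Q is clamped between those two frames' Qs, and if the previous frame overshot while Q wants to rise above the clamp, the midpoint is used for a faster reaction. A self-contained sketch of just that clamp (helper names are local to this example; the reset_high_source_sad and golden-boost guards are omitted):

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* Damp Q oscillation between two alternating operating points. */
static int damp_q(int q, int q_1_frame, int q_2_frame, int rc_1_frame,
                  int rc_2_frame) {
  if (rc_1_frame * rc_2_frame == -1 && q_1_frame != q_2_frame) {
    const int lo = q_1_frame < q_2_frame ? q_1_frame : q_2_frame;
    const int hi = q_1_frame < q_2_frame ? q_2_frame : q_1_frame;
    const int qclamp = clamp_int(q, lo, hi);
    /* Previous frame overshot: relax the clamp upward by half the gap. */
    if (rc_1_frame == -1 && q > qclamp) return (q + qclamp) >> 1;
    return qclamp;
  }
  return q;
}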
+ if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q); + return q; } @@ -730,8 +841,10 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. - // Maximum limit for down adjustment, ~30%. + // Maximum limit for down adjustment ~30%; make it lower for screen content. int max_adjustment_down = active_worst_quality / 3; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + max_adjustment_down = active_worst_quality >> 3; if (max_adjustment_down) { buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / max_adjustment_down); @@ -1118,9 +1231,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); - if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { - active_best_quality /= 4; - } // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -1500,7 +1610,11 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) { + if (cm->frame_type != KEY_FRAME && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) { compute_frame_low_motion(cpi); if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); } @@ -1867,8 +1981,13 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( cpi->framerate, rc->min_gf_interval); - // Extended max interval for genuinely static scenes like slide shows. - rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; + // Extended interval for genuinely static scenes + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + + if (is_altref_enabled(cpi)) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; @@ -2426,6 +2545,19 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) rc->this_frame_target = rc->avg_frame_bandwidth; } + // For SVC the new (updated) avg_source_sad[0] for the current superframe + // updates the setting for all layers. + if (cpi->use_svc) { + int sl, tl; + SVC *const svc = &cpi->svc; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_source_sad[0] = rc->avg_source_sad[0]; + } + } // For VBR, under scene change/high content change, force golden refresh. 
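Relatedly, the static-scene GF interval set in vp9_rc_set_gf_interval_range() above is now tied to the encoder lag: it starts from MAX_LAG_BUFFERS * 2, is capped at lag_in_frames - 1 when alt-ref frames are enabled, and then bounds max_gf_interval itself. A compact sketch of that ordering (parameters stand in for the rc and oxcf fields of the same names):

/* Mirror of the interval-capping order in vp9_rc_set_gf_interval_range(). */
static int capped_max_gf_interval(int max_lag_buffers, int lag_in_frames,
                                  int altref_enabled, int max_gf_interval) {
  int static_scene_max = max_lag_buffers * 2;
  if (altref_enabled && static_scene_max > lag_in_frames - 1)
    static_scene_max = lag_in_frames - 1; /* cannot exceed the actual lag */
  if (max_gf_interval > static_scene_max) max_gf_interval = static_scene_max;
  return max_gf_interval;
}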
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2459,8 +2591,11 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int thresh_qp = 3 * (rc->worst_quality >> 2); - int thresh_rate = rc->avg_frame_bandwidth * 10; + int thresh_qp = 7 * (rc->worst_quality >> 3); + int thresh_rate = rc->avg_frame_bandwidth << 3; + // Lower rate threshold for video. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + thresh_rate = rc->avg_frame_bandwidth << 2; if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) { double rate_correction_factor = cpi->rc.rate_correction_factors[INTER_NORMAL]; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 3a40e0138..c1b210677 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -34,14 +34,6 @@ extern "C" { #define FRAME_OVERHEAD_BITS 200 -// Threshold used to define a KF group as static (e.g. a slide show). -// Essentially this means that no frame in the group has more than 1% of MBs -// that are not marked as coded with 0,0 motion in the first pass. -#define STATIC_KF_GROUP_THRESH 99 - -// The maximum duration of a GF group that is static (for example a slide show). -#define MAX_STATIC_GF_GROUP_LENGTH 250 - typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 59022c106..8201bba70 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -108,7 +108,11 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#if CONFIG_CONSISTENT_RECODE + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#endif int RDMULT; int RDDIV; } RD_OPT; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e39df033a..e3672edf5 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -847,7 +847,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX } }; int n; int s0, s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = ref_best_rd; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; const int tx_size_ctx = get_tx_size_context(xd); @@ -868,8 +868,8 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (n = start_tx; n >= end_tx; n--) { const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; - txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, - bs, n, cpi->sf.use_fast_coef_costing); + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -3612,9 +3612,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { - // If adaptive interp filter is enabled, then the current leaf node of 8x8 - // data is needed for sub8x8. Hence preserve the context. +// If adaptive interp filter is enabled, then the current leaf node of 8x8 +// data is needed for sub8x8. Hence preserve the context. 
+#if CONFIG_CONSISTENT_RECODE + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#else if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 90da68726..d2842697d 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -375,6 +375,8 @@ static void set_rt_speed_feature_framesize_independent( sf->nonrd_keyframe = 0; sf->svc_use_lowres_part = 0; sf->re_encode_overshoot_rt = 0; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -537,8 +539,14 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) sf->nonrd_keyframe = 1; if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && - cpi->oxcf.content == VP9E_CONTENT_SCREEN) + (cpi->use_svc || cpi->oxcf.content == VP9E_CONTENT_SCREEN)) { sf->re_encode_overshoot_rt = 1; + } + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } } if (speed >= 6) { @@ -661,6 +669,21 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; } + + if (speed >= 9) { + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 2; + if (cpi->rc.avg_frame_low_motion < 40) + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = 1; + sf->mv.adapt_subpel_force_stop.force_stop_above = 2; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + } + if (sf->use_altref_onepass) { if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { sf->partition_search_type = FIXED_PARTITION; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 946bf0545..251cfdbcd 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -161,6 +161,17 @@ typedef enum { ONE_LOOP_REDUCED = 1 } FAST_COEFF_UPDATE; +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for full pixel motion vector; + int mv_thresh; + + // subpel_force_stop if full pixel MV is below the threshold. + int force_stop_below; + + // subpel_force_stop if full pixel MV is equal to or above the threshold. + int force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -189,6 +200,11 @@ typedef struct MV_SPEED_FEATURES { // 3: Stop at full pixel. int subpel_force_stop; + // If it's enabled, different subpel_force_stop will be used for different MV. + int enable_adaptive_subpel_force_stop; + + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; + // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; } MV_SPEED_FEATURES; @@ -515,6 +531,12 @@ typedef struct SPEED_FEATURES { // Enable re-encoding on scene change with potential high overshoot, // for real-time encoding flow. 
int re_encode_overshoot_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling golden reference. + int disable_golden_ref; } SPEED_FEATURES; struct VP9_COMP; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 07d1995a8..d745ae0df 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -41,7 +41,10 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; svc->framedrop_mode = CONSTRAINED_LAYER_DROP; - for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; + for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = -1; + svc->fb_idx_temporal_layer_id[i] = -1; + } for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { svc->last_layer_dropped[sl] = 0; svc->drop_spatial_layer[sl] = 0; @@ -52,7 +55,10 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; } + svc->max_consec_drop = INT_MAX; if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, @@ -787,7 +793,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = 0; if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id]) { + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] && + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] != -1 && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) { // For fixed/non-flexible mode, if the previous frame (same spatial layer // from previous superframe) was dropped, make sure the lst_fb_idx // for this frame corresponds to the buffer index updated on (last) encoded @@ -903,12 +911,11 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { } } } - // Check for disabling inter-layer prediction if - // INTER_LAYER_PRED_ON_CONSTRAINED is enabled. - // If the reference for inter-layer prediction (the reference that is scaled) - // is not the previous spatial layer from the same superframe, then we - // disable inter-layer prediction. - if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_ON_CONSTRAINED) { + // Check for disabling inter-layer prediction if the reference for inter-layer + // prediction (the reference that is scaled) is not the previous spatial layer + // from the same superframe, then we disable inter-layer prediction. + // Only need to check when inter_layer prediction is not set to OFF mode. + if (cpi->svc.disable_inter_layer_pred != INTER_LAYER_PRED_OFF) { // We only use LAST and GOLDEN for prediction in real-time mode, so we // check both here. MV_REFERENCE_FRAME ref_frame; @@ -940,3 +947,46 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { } } } + +void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // For fixed/non-flexible mode, the folllowing constraint are expected, + // when inter-layer prediciton is on (default). 
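// Concretely, on non-key frames LAST acts as the temporal reference (its
// buffer was last refreshed by the base or a lower temporal layer) and GOLDEN
// acts as the spatial reference (its buffer was refreshed by the spatial
// layer directly below, at the same temporal layer). A sketch of the
// bookkeeping these asserts rely on -- hypothetical helper, the actual
// recording happens wherever a frame buffer slot is refreshed:
//   svc->fb_idx_spatial_layer_id[idx] = svc->spatial_layer_id;
//   svc->fb_idx_temporal_layer_id[idx] = svc->temporal_layer_id;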
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && + svc->framedrop_mode != LAYER_DROP) { + if (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) { + // On non-key frames: LAST is always temporal reference, GOLDEN is + // spatial reference. + if (svc->temporal_layer_id == 0) + // Base temporal only predicts from base temporal. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from lower temporal layer. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0) { + // Non-base spatial only predicts from lower spatial layer with same + // temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0) { + // Only 1 reference for frame whose base is key; reference may be LAST + // or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 617717049..9be5bb7ea 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -96,7 +96,6 @@ typedef struct SVC { int lst_fb_idx[VPX_MAX_LAYERS]; int gld_fb_idx[VPX_MAX_LAYERS]; int alt_fb_idx[VPX_MAX_LAYERS]; - int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; int non_reference_frame; @@ -122,6 +121,8 @@ typedef struct SVC { int last_layer_dropped[VPX_MAX_LAYERS]; int drop_spatial_layer[VPX_MAX_LAYERS]; int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int max_consec_drop; SVC_LAYER_DROP_MODE framedrop_mode; INTER_LAYER_PRED disable_inter_layer_pred; @@ -141,7 +142,12 @@ typedef struct SVC { // Keep track of the frame buffer index updated/refreshed on the base // temporal superframe. - uint8_t fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. 
+ uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; } SVC; struct VP9_COMP; @@ -201,6 +207,8 @@ void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5eaa7a18a..2758314fb 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -248,7 +248,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, row_mt, 0, 1); RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -8, 8); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -709,6 +709,8 @@ static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args); + // TODO(jianj): Look into issue of noise estimation with high bitdepth. + if (ctx->cfg.g_bit_depth > 8) extra_cfg.noise_sensitivity = 0; return update_extra_cfg(ctx, &extra_cfg); } @@ -1536,6 +1538,8 @@ static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx, cpi->svc.framedrop_mode = data->framedrop_mode; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl]; + // Don't allow max_consec_drop values below 1. + cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); return VPX_CODEC_OK; } diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index b201d96f4..44519e063 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -786,7 +786,8 @@ typedef struct vpx_svc_ref_frame_config { typedef enum { CONSTRAINED_LAYER_DROP, /**< Upper layers are constrained to drop if current layer drops. */ - LAYER_DROP, /**< Any spatial layer can drop. */ + LAYER_DROP, /**< Any spatial layer can drop. */ + FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ } SVC_LAYER_DROP_MODE; /*!\brief vp9 svc frame dropping parameters. @@ -799,7 +800,8 @@ typedef enum { typedef struct vpx_svc_frame_drop { int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */ SVC_LAYER_DROP_MODE - framedrop_mode; /**< Layer-based or constrained dropping. */ + framedrop_mode; /**< Layer-based or constrained dropping. */ + int max_consec_drop; /**< Maximum consecutive drops, for any layer. 
*/ } vpx_svc_frame_drop_t; /*!\cond */ diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c index 1370ec2d2..5afdece0a 100644 --- a/vpx_dsp/arm/avg_pred_neon.c +++ b/vpx_dsp/arm/avg_pred_neon.c @@ -17,8 +17,8 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { - int x, y; - for (y = 0; y < height; ++y) { + int x, y = height; + do { for (x = 0; x < width; x += 16) { const uint8x16_t p = vld1q_u8(pred + x); const uint8x16_t r = vld1q_u8(ref + x); @@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, comp += width; pred += width; ref += ref_stride; - } + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); } else { - int i; - for (i = 0; i < width * height; i += 16) { + int i = width * height; + assert(width == 4); + do { const uint8x16_t p = vld1q_u8(pred); uint8x16_t r; - if (width == 4) { - r = load_unaligned_u8q(ref, ref_stride); - ref += 4 * ref_stride; - } else { - const uint8x8_t r_0 = vld1_u8(ref); - const uint8x8_t r_1 = vld1_u8(ref + ref_stride); - assert(width == 8); - r = vcombine_u8(r_0, r_1); - ref += 2 * ref_stride; - } + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; r = vrhaddq_u8(r, p); vst1q_u8(comp, r); pred += 16; comp += 16; - } + i -= 16; + } while (i); } } diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index ce81fb630..eef123368 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -9,71 +9,72 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; + int r = rows, c; if (cols > 16) { - for (r = 0; r < rows; ++r) { + do { for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + const uint8x16_t s0 = vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); + const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), 
vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, (int)src_stride); + const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } diff --git a/vpx_dsp/ppc/deblock_vsx.c b/vpx_dsp/ppc/deblock_vsx.c new file mode 100644 index 000000000..4329081ee --- /dev/null +++ b/vpx_dsp/ppc/deblock_vsx.c @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +extern const int16_t vpx_rv[]; + +static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, + 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static INLINE uint8x16_t vec_abd_s8(uint8x16_t a, uint8x16_t b) { + return vec_sub(vec_max(a, b), vec_min(a, b)); +} + +static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v, + uint8x16_t filter) { + const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]); + const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]); + const uint8x16_t k3 = vec_avg(k1, k2); + const uint8x16_t f_a = vec_max(vec_abd_s8(v, ctx[0]), vec_abd_s8(v, ctx[1])); + const uint8x16_t f_b = vec_max(vec_abd_s8(v, ctx[2]), vec_abd_s8(v, ctx[3])); + const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter); + return vec_sel(v, vec_avg(k3, v), mask); +} + +static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src, + int stride) { + ctx[0] = vec_vsx_ld(col - 2 * stride, src); + ctx[1] = vec_vsx_ld(col - stride, src); + ctx[2] = vec_vsx_ld(col + stride, src); + ctx[3] = vec_vsx_ld(col + 2 * stride, src); +} + +static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx, + uint8x16_t v, uint8x16_t right_ctx) { + static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1A, 0x1B, 0x1C, 0x1D }; + + static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E }; + + static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10 }; + + static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11 }; + ctx[0] = vec_perm(left_ctx, v, l2_perm); + ctx[1] = vec_perm(left_ctx, v, l1_perm); + ctx[2] = vec_perm(v, right_ctx, r1_perm); + ctx[3] = vec_perm(v, right_ctx, r2_perm); +} +void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int cols, + unsigned char *f, int size) { + int row, col; + uint8x16_t ctx[4], out, v, left_ctx; + + for (row = 0; row < size; row++) { + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + /* now post_proc_across */ + left_ctx = vec_splats(dst_ptr[0]); + v = vec_vsx_ld(0, dst_ptr); + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = (col + 16 == cols) + ? 
vec_splats(dst_ptr[cols - 1]) + : vec_vsx_ld(col, dst_ptr + 16); + horz_ctx(ctx, left_ctx, v, right_ctx); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + left_ctx = v; + v = right_ctx; + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]); + horz_ctx(ctx, left_ctx, v, right_ctx); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +// C: s[c + 7] +static INLINE int16x8_t next7l_s16(uint8x16_t c) { + static const uint8x16_t next7_perm = { + 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13, + 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17, + }; + return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm); +} + +// Slide across window and add. +static INLINE int16x8_t slide_sum_s16(int16x8_t x) { + // x = A B C D E F G H + // + // 0 A B C D E F G + const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); + // 0 0 A B C D E F + const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), + // 0 0 0 A B C D E + vec_slo(x, vec_splats((int8_t)(6 << 3)))); + // 0 0 0 0 A B C D + const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), + // 0 0 0 0 0 A B C + vec_slo(x, vec_splats((int8_t)(10 << 3)))); + // 0 0 0 0 0 0 A B + const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), + // 0 0 0 0 0 0 0 A + vec_slo(x, vec_splats((int8_t)(14 << 3)))); + return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4)); +} + +// Slide across window and add. +static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { + // 0 A C E + // + 0 B D F + int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); + // 0 0 A C + // + 0 0 B D + int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); + // 0 0 0 A + // + 0 0 0 B + int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3)))); + sumsq_1 = vec_add(sumsq_1, xsq_even); + sumsq_2 = vec_add(sumsq_2, sumsq_3); + return vec_add(sumsq_1, sumsq_2); +} + +// C: (b + sum + val) >> 4 +static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) { + return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4)); +} + +// C: sumsq * 15 - sum * sum +static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd, + int16x8_t sum, int32x4_t lim) { + static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, + 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + const int32x4_t sumsq_odd_scaled = + vec_mul(sumsq_odd, vec_splats((int32_t)15)); + const int32x4_t sumsq_even_scaled = + vec_mul(sumsq_even, vec_splats((int32_t)15)); + const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum)); + const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum)); + + const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim); + const bool32x4_t mask_even = vec_cmplt(thres_even, lim); + return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge); +} + +void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int row, col; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. 
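// Scalar shape of the filter being vectorized here (a rough sketch, not the
// exact C reference): a running sum and sum of squares are kept over a
// 15-sample window around s[c], and a pixel is only rewritten when the local
// variance test passes:
//   if (sumsq * 15 - sum * sum < flimit)
//     s[c] = (8 + sum + s[c]) >> 4;
// The left border is handled by replicating the first column, which is why
// the per-row initialization below seeds the window with src[0] * 9 before
// adding the next six columns.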
+ assert(cols % 8 == 0); + + for (row = 0; row < rows; row++) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + // Fill left context with first col. + int16x8_t left_ctx = vec_splats((int16_t)src[0]); + int16_t s = src[0] * 9; + int32_t ssq = src[0] * src[0] * 9 + 16; + + // Fill the next 6 columns of the sliding window with cols 2 to 7. + for (col = 1; col <= 6; ++col) { + s += src[col]; + ssq += src[col] * src[col]; + } + // Set this sum to every element in the window. + sum = vec_splats(s); + sumsq_even = vec_splats(ssq); + sumsq_odd = vec_splats(ssq); + + for (col = 0; col < cols; col += 8) { + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const uint8x16_t val = vec_vsx_ld(0, src + col); + const int16x8_t val_high = unpack_to_s16_h(val); + + // C: s[c + 7] + const int16x8_t right_ctx = (col + 8 == cols) + ? vec_splats((int16_t)src[col + 7]) + : next7l_s16(val); + + // C: x = s[c + 7] - s[c - 8]; + const int16x8_t x = vec_sub(right_ctx, left_ctx); + const int32x4_t xsq_even = + vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx)); + const int32x4_t xsq_odd = + vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx)); + + const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_even = vec_add(sumsq_even, sumsq_tmp); + // B D F G + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd)); + + sum = vec_add(sum, slide_sum_s16(x)); + + // C: (8 + sum + s[c]) >> 4 + filtered = filter_s16(vec_splats((int16_t)8), sum, val_high); + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(val_high, filtered, mask); + + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge); + vec_vsx_st(out, 0, src + col); + + // Update window sum and square sum + sum = vec_splat(sum, 7); + sumsq_even = vec_splat(sumsq_odd, 3); + sumsq_odd = vec_splat(sumsq_odd, 3); + + // C: s[c - 8] (for next iteration) + left_ctx = val_high; + } + src += pitch; + } +} + +void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int col, row, i; + int16x8_t window[16]; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + // If rows is less than 8 the bottom border extension fails. + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t r1, sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); + // Fill sliding window with first row. + for (i = 0; i <= 8; i++) { + window[i] = r1; + } + // First 9 rows of the sliding window are the same. + // sum = r1 * 9 + sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16); + + // sumsq = r1 * r1 * 9 + sumsq_even = vec_mule(sum, r1); + sumsq_odd = vec_mulo(sum, r1); + + // Fill the next 6 rows of the sliding window with rows 2 to 7. 
+ for (i = 1; i <= 6; ++i) { + const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst)); + window[i + 8] = next_row; + sum = vec_add(sum, next_row); + sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row)); + sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row)); + } + + for (row = 0; row < rows; row++) { + int32x4_t d15_even, d15_odd, d0_even, d0_odd; + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127)); + + // Move the sliding window + if (row + 7 < rows) { + window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst)); + } else { + window[15] = window[14]; + } + + // C: sum += s[7 * pitch] - s[-8 * pitch]; + sum = vec_add(sum, vec_sub(window[15], window[0])); + + // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * + // pitch]; + // Optimization Note: Caching a squared-window for odd and even is + // slower than just repeating the multiplies. + d15_odd = vec_mulo(window[15], window[15]); + d15_even = vec_mule(window[15], window[15]); + d0_odd = vec_mulo(window[0], window[0]); + d0_even = vec_mule(window[0], window[0]); + sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd)); + sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even)); + + // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4 + filtered = filter_s16(rv, sum, window[8]); + + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(window[8], filtered, mask); + + // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per + // iteration + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch), + load_merge); + vec_vsx_st(out, 0, dst + row * pitch); + + // Optimization Note: Turns out that the following loop is faster than + // using pointers to manage the sliding window. 
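// (For comparison, a circular-buffer variant would avoid the copies, e.g. a
//  hypothetical sketch:
//    start = (start + 1) & 15;                 // drop the oldest row
//    window[(start + 15) & 15] = newest_row;   // append the newest row
//  at the cost of masking every window[] access; per the note above, the
//  plain shift measured faster here.)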
+ for (i = 1; i < 16; i++) { + window[i - 1] = window[i]; + } + } + dst += 8; + } +} diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index f095cb0a4..6603b85ac 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -76,6 +76,8 @@ static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; +static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ const uint32x4_t shift14 = vec_splat_u32(14); @@ -107,6 +109,15 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; @@ -114,13 +125,10 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; int16x8_t v0 = load_tran_low(0, input); int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); int16x8_t t0 = vec_mergeh(v0, v1); int16x8_t t1 = vec_mergel(v0, v1); - uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -130,6 +138,7 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; uint8_t tmp_dest[16]; ROUND_SHIFT_INIT @@ -148,13 +157,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD4(v0, t_out0); PIXEL_ADD4(v1, t_out1); - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); - tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); - output_v = vec_packsu(tmp16_0, tmp16_1); - vec_vsx_st(output_v, 0, tmp_dest); - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + PACK_STORE(v0, v1); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -1062,3 +1066,67 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, ADD_STORE_BLOCK(src2, 16); ADD_STORE_BLOCK(src3, 24); } + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + 
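/* TRANSFORM_COLS above mirrors the scalar 1-D inverse Walsh-Hadamard
 * butterfly used by the generic C iwht4x4 (sketched here; see
 * vpx_dsp/inv_txfm.c for the reference):
 *
 *   a += c;
 *   d -= b;
 *   e  = (a - d) >> 1;
 *   b  = e - b;
 *   c  = e - c;
 *   a -= b;
 *   d += c;
 *
 * The function below first scales the input down (>> 2), runs this butterfly
 * once per direction with TRANSPOSE_WHT in between, and then packs and adds
 * the result to dest via PACK_STORE. */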
+void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index e037f89e3..3a9092f64 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -20,31 +20,70 @@ static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { return vec_xor(vec_add(a, mask), mask); } +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + // Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit // integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right // shift. - return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_s16); + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); } +// Quantization function used for 4x4, 8x8 and 16x16 blocks. static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs, int16x8_t round, int16x8_t quant, int16x8_t quant_shift, bool16x8_t mask) { - int16x8_t rounded, qcoeff; - rounded = vec_vaddshs(coeff_abs, round); - qcoeff = vec_mulhi(rounded, quant); + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); qcoeff = vec_add(qcoeff, rounded); qcoeff = vec_mulhi(qcoeff, quant_shift); qcoeff = vec_sign(qcoeff, coeff); return vec_and(qcoeff, mask); } +// Quantization function used for 32x32 blocks. +static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, + bool16x8_t mask) { + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + // 32x32 blocks require an extra multiplication by 2, this compensates for the + // extra right shift added in vec_mulhi, as such vec_madds can be used + // directly instead of vec_mulhi (((a * b) >> 15) >> 1) << 1 == (a * b >> 15) + qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +// DeQuantization function used for 32x32 blocks. 
Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int16x8_t dqcoeff; + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + dqcoeff = vec_pack(dqcoeffe, dqcoeffo); + return vec_perm(dqcoeff, dqcoeff, vec_perm_merge); +} + static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, - const int16_t *iscan_ptr) { - bool16x8_t zero_coeff; - int16x8_t scan = vec_vsx_ld(0, iscan_ptr); - zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); + const int16_t *iscan_ptr, int index) { + int16x8_t scan = vec_vsx_ld(index, iscan_ptr); + bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -64,7 +103,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - int16x8_t qcoeff, dqcoeff, eob; + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; // First set of 8 coeff starts with DC + 7 AC int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); @@ -73,51 +113,194 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); - int16x8_t coeff = vec_vsx_ld(0, coeff_ptr); - int16x8_t coeff_abs = vec_abs(coeff); - bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); (void)scan_ptr; (void)skip_block; assert(!skip_block); - qcoeff = - quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); - vec_vsx_st(qcoeff, 0, qcoeff_ptr); - - dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); - vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); - - eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr); - - // All other sets of 8 coeffs will only contain AC - zbin = vec_splat(zbin, 1); + qcoeff0 = + quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); round = vec_splat(round, 1); quant = vec_splat(quant, 1); - dequant = vec_splat(dequant, 1); quant_shift = vec_splat(quant_shift, 1); + qcoeff1 = + quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + + if (n_coeffs > 16) { + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2; + 
bool16x8_t zero_mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = + vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} + +void vpx_quantize_b_32x32_vsx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + (void)scan_ptr; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + // 32x32 quantization requires that zbin and round be divided by 2 + zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); // remove DC from zbin + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + round = vec_splat(round, 1); // remove DC from round + quant = vec_splat(quant, 1); // remove DC from quant + quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr); + dequant = 
vec_splat(dequant, 1); // remove DC from dequant + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); - n_coeffs -= 8; do { - coeff_ptr += 8; - qcoeff_ptr += 8; - dqcoeff_ptr += 8; - iscan_ptr += 8; + int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; + bool16x8_t zero_mask2; + + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); - coeff = vec_vsx_ld(0, coeff_ptr); - coeff_abs = vec_abs(coeff); - zero_mask = vec_cmpge(coeff_abs, zbin); - qcoeff = - quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); - vec_vsx_st(qcoeff, 0, qcoeff_ptr); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); - dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); - vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); - n_coeffs -= 8; - } while (n_coeffs > 0); + // 24 int16_t is 48 bytes + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); eob = vec_max_across(eob); *eob_ptr = eob[0]; diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h index e2af55463..c6c7ce9f1 100644 --- a/vpx_dsp/ppc/types_vsx.h +++ b/vpx_dsp/ppc/types_vsx.h @@ -19,7 +19,9 @@ typedef vector signed short int16x8_t; typedef vector unsigned short uint16x8_t; typedef vector signed int int32x4_t; typedef vector unsigned int uint32x4_t; +typedef vector bool char bool8x16_t; typedef vector bool short bool16x8_t; +typedef vector bool int bool32x4_t; #ifdef __clang__ static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, @@ -66,9 +68,15 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, #endif #endif +static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; -static const uint16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 }; +static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 }; static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 }; static const uint8x16_t vec_perm64 = { 0x08, 0x09, 
0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; @@ -79,4 +87,8 @@ static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, 0x0E, 0x0F, 0x00, 0x01 }; +static const uint8x16_t vec_perm_merge = { 0x00, 0x01, 0x08, 0x09, 0x02, 0x03, + 0x0A, 0x0B, 0x04, 0x05, 0x0C, 0x0D, + 0x06, 0x07, 0x0E, 0x0F }; + #endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index 1efe2f005..d3f257b63 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -10,10 +10,11 @@ #include <assert.h> +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/ppc/types_vsx.h" -static inline uint8x16_t read4x2(const uint8_t *a, int stride) { +static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); @@ -101,3 +102,174 @@ void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, } } } + +static INLINE void variance_inner_32(const uint8_t *a, const uint8_t *b, + int32x4_t *sum_squared, int32x4_t *sum) { + int32x4_t s = *sum; + int32x4_t ss = *sum_squared; + + const uint8x16_t va0 = vec_vsx_ld(0, a); + const uint8x16_t vb0 = vec_vsx_ld(0, b); + const uint8x16_t va1 = vec_vsx_ld(16, a); + const uint8x16_t vb1 = vec_vsx_ld(16, b); + + const int16x8_t a0 = unpack_to_s16_h(va0); + const int16x8_t b0 = unpack_to_s16_h(vb0); + const int16x8_t a1 = unpack_to_s16_l(va0); + const int16x8_t b1 = unpack_to_s16_l(vb0); + const int16x8_t a2 = unpack_to_s16_h(va1); + const int16x8_t b2 = unpack_to_s16_h(vb1); + const int16x8_t a3 = unpack_to_s16_l(va1); + const int16x8_t b3 = unpack_to_s16_l(vb1); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int16x8_t d2 = vec_sub(a2, b2); + const int16x8_t d3 = vec_sub(a3, b3); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + s = vec_sum4s(d2, s); + ss = vec_msum(d2, d2, ss); + s = vec_sum4s(d3, s); + ss = vec_msum(d3, d3, ss); + *sum = s; + *sum_squared = ss; +} + +static INLINE void variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { + int i; + + int32x4_t s = vec_splat_s32(0); + int32x4_t ss = vec_splat_s32(0); + + switch (w) { + case 4: + for (i = 0; i < h / 2; ++i) { + const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); + const int16x8_t d = vec_sub(a0, b0); + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + a += a_stride * 2; + b += b_stride * 2; + } + break; + case 8: + for (i = 0; i < h; ++i) { + const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, a)); + const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, b)); + const int16x8_t d = vec_sub(a0, b0); + + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + a += a_stride; + b += b_stride; + } + break; + case 16: + for (i = 0; i < h; ++i) { + const uint8x16_t va = vec_vsx_ld(0, a); + const uint8x16_t vb = vec_vsx_ld(0, b); + const int16x8_t a0 = unpack_to_s16_h(va); + const int16x8_t b0 = unpack_to_s16_h(vb); + const int16x8_t a1 = unpack_to_s16_l(va); + const int16x8_t b1 = unpack_to_s16_l(vb); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + + a += a_stride; + b += 
b_stride; + } + break; + case 32: + for (i = 0; i < h; ++i) { + variance_inner_32(a, b, &ss, &s); + a += a_stride; + b += b_stride; + } + break; + case 64: + for (i = 0; i < h; ++i) { + variance_inner_32(a, b, &ss, &s); + variance_inner_32(a + 32, b + 32, &ss, &s); + + a += a_stride; + b += b_stride; + } + break; + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste(s, 0, sum); + + ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)ss, 0, sse); +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse, int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define VARIANCES(W, H) VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index cb06a476f..573d6fef1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -69,6 +69,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm +DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 93ecd7c19..9661f3bd8 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -626,7 +626,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; - specialize qw/vpx_iwht4x4_16_add sse2/; + specialize qw/vpx_iwht4x4_16_add sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
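For reference, every VAR(W, H) wrapper defined above boils down to the usual
sum / sum-of-squares identity; a scalar sketch of the same computation
(illustrative only, the helper name is made up):

    #include <stdint.h>

    /* variance = SSE - SUM^2 / (W * H), with SUM and SSE accumulated over
     * the per-pixel differences src[i] - ref[i]. */
    static uint32_t variance_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse) {
      int64_t sum = 0;
      uint64_t sse64 = 0;
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int d = src[c] - ref[c];
          sum += d;
          sse64 += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (uint32_t)sse64;
      return *sse - (uint32_t)((sum * sum) / (w * h));
    }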
@@ -702,7 +702,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; @@ -1082,64 +1082,64 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq " # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi/; + specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi/; + specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x8 sse2 neon msa mmi/;
+ specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x4 sse2 neon msa mmi/;
+ specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x8 sse2 neon msa mmi/;
+ specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x4 sse2 neon msa mmi/;
+ specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/;
 #
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+ specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/;
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_get8x8var sse2 neon msa/;
+ specialize qw/vpx_get8x8var sse2 neon msa vsx/;
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
+ specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/;
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x8 sse2 avx2 msa mmi/;
+ specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x16 sse2 msa mmi/;
+ specialize qw/vpx_mse8x16 sse2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse8x8 sse2 msa mmi/;
+ specialize qw/vpx_mse8x8 sse2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
 specialize qw/vpx_get_mb_ss sse2 msa vsx/;
@@ -1598,13 +1598,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
 specialize qw/vpx_plane_add_noise sse2 msa/;
 add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vpx_mbpost_proc_down sse2 neon msa/;
+ specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
 add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
- specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/;
+ specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
 add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
- specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
+ specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/;
 }
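Note on the specialize changes above: adding vsx to a specialize line is what exposes the new _vsx functions to libvpx's run-time CPU dispatch; the actual dispatch code is generated into a vpx_dsp_rtcd.h header at build time from this .pl file. The following is only a rough, hand-written sketch of that pattern, not code from this change; the typedef, HAS_VSX flag, and ppc_simd_caps() query are stand-ins for the generated and ports-layer code.

#include <stdint.h>

/* Function-pointer type matching the vpx_variance prototypes above. */
typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, unsigned int *sse);

/* Plain C reference and VSX-specialized implementations (declared only). */
unsigned int vpx_variance16x16_c(const uint8_t *, int, const uint8_t *, int,
                                 unsigned int *);
unsigned int vpx_variance16x16_vsx(const uint8_t *, int, const uint8_t *, int,
                                   unsigned int *);

/* Stand-ins for the PowerPC capability check in the ports layer. */
#define HAS_VSX 0x01
int ppc_simd_caps(void);

/* Callers always go through this pointer. */
vpx_variance_fn_t vpx_variance16x16 = vpx_variance16x16_c;

void setup_rtcd_internal(void) {
  const int flags = ppc_simd_caps();
  vpx_variance16x16 = vpx_variance16x16_c;
  if (flags & HAS_VSX) vpx_variance16x16 = vpx_variance16x16_vsx;
}

In other words, callers keep invoking vpx_variance16x16() and transparently get the VSX variant on POWER hardware that reports VSX support, with the C version as the fallback.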
diff --git a/vpx_ports/config.h b/vpx_ports/config.h
deleted file mode 100644
index 3c1ab99f4..000000000
--- a/vpx_ports/config.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_PORTS_CONFIG_H_
-#define VPX_PORTS_CONFIG_H_
-
-#include "vpx_config.h"
-
-#endif // VPX_PORTS_CONFIG_H_
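Closing reference for the new vpx_dsp/ppc/variance_vsx.c above: the VAR, GET_VAR, and MSE macros all wrap a helper that accumulates the sum of pixel differences and the sum of squared differences, and the block variance is then sse - sum^2 / (w*h), exactly as in the return statement of the VAR macro. A scalar sketch of that computation is shown below; it is illustrative only, and the function names here are hypothetical rather than part of the change.

#include <stdint.h>

/* Scalar reference: *sse accumulates sum((a-b)^2) and *sum accumulates
 * sum(a-b) over a w x h block; this is what the VSX variance() helper
 * computes with vector loads and vec_msum/vec_sums reductions. */
static void variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                         int b_stride, int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += (uint32_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
}

/* Mirrors what VAR(8, 8) expands to: variance = sse - sum^2 / (w*h). */
static uint32_t variance8x8_ref(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                uint32_t *sse) {
  int sum;
  variance_ref(a, a_stride, b, b_stride, 8, 8, sse, &sum);
  return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * 8));
}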