64 files changed, 3381 insertions, 578 deletions
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 09bdc5d2f..6d96244e0 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -202,11 +202,12 @@ clean:
 	@$(RM) $(CLEAN-OBJS)
 
 ifeq ($(ENABLE_SHARED),1)
+  LOCAL_CFLAGS += -fPIC
   include $(BUILD_SHARED_LIBRARY)
 else
   include $(BUILD_STATIC_LIBRARY)
 endif
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-$(call import-module,cpufeatures)
+$(call import-module,android/cpufeatures)
 endif
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index 7272424af..029cc4a56 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -138,14 +138,6 @@ while (<STDIN>)
     s/DCD(.*)/.long $1/;
     s/DCB(.*)/.byte $1/;
 
-    # RN to .req
-    if (s/RN\s+([Rr]\d+|lr)/.req $1/)
-    {
-        print;
-        print "$comment_sub$comment\n" if defined $comment;
-        next;
-    }
-
     # Make function visible to linker, and make additional symbol with
     # prepended underscore
     s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 1a9e105ba..e1ae7b4f8 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -120,18 +120,6 @@ while (<STDIN>)
     s/DCD(.*)/.long $1/;
     s/DCB(.*)/.byte $1/;
 
-    # Build a hash of all the register - alias pairs.
-    if (s/(.*)RN(.*)/$1 .req $2/g)
-    {
-        $register_aliases{trim($1)} = trim($2);
-        next;
-    }
-
-    while (($key, $value) = each(%register_aliases))
-    {
-        s/\b$key\b/$value/g;
-    }
-
     # Make function visible to linker, and make additional symbol with
     # prepended underscore
     s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f050fa06a..007e02000 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -936,6 +936,7 @@ EOF
             # only "AppContainerApplication" which requires an AppxManifest.
             # Therefore disable the examples, just build the library.
             disable_feature examples
+            disable_feature tools
           fi
           ;;
         rvct)
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index e3395afa2..2cf62c117 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -82,7 +82,7 @@ generate_filter() {
                        | sed -e "s,$src_path_bare,," \
                              -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
 
-                if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+                if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then
                     # Avoid object file name collisions, i.e. vpx_config.c and
                     # vpx_config.asm produce the same object file without
                     # this additional suffix.
@@ -452,7 +452,7 @@ generate_vcxproj() {
     done
 
     open_tag ItemGroup
-    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s"
+    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S"
     close_tag ItemGroup
     open_tag ItemGroup
     generate_filter "Header Files"   "h;hm;inl;inc;xsd"
diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc
new file mode 100644
index 000000000..69f62f13b
--- /dev/null
+++ b/test/decode_svc_test.cc
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+
+namespace {
+
+const unsigned int kNumFrames = 19;
+
+class DecodeSvcTest : public ::libvpx_test::DecoderTest,
+                      public ::libvpx_test::CodecTestWithParam<const char *> {
+ protected:
+  DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {}
+  virtual ~DecodeSvcTest() {}
+
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (video.frame_number() == 0)
+      decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_);
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_EQ(img.d_w, width_);
+    ASSERT_EQ(img.d_h, height_);
+    total_frames_ = frame_number;
+  }
+
+  int spatial_layer_;
+  unsigned int width_;
+  unsigned int height_;
+  unsigned int total_frames_;
+};
+
+// SVC test vector is 1280x720, with 3 spatial layers, and 20 frames.
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 0. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This results in 1/4x1/4 resolution (320x180).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 0;
+  width_ = 320;
+  height_ = 180;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 1. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This results in 1/2x1/2 resolution (640x360).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 1;
+  width_ = 640;
+  height_ = 360;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 2. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This results in the full resolution (1280x720).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 2;
+  width_ = 1280;
+  height_ = 720;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 10. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This is beyond the number of spatial layers, so
+// the decoding should result in the full resolution (1280x720).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 10;
+  width_ = 1280;
+  height_ = 720;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc,
+                                       libvpx_test::kVP9TestVectorsSvc +
+                                           libvpx_test::kNumVP9TestVectorsSvc));
+}  // namespace
diff --git a/test/examples.sh b/test/examples.sh
index 39f7e392d..629f04239 100755
--- a/test/examples.sh
+++ b/test/examples.sh
@@ -15,7 +15,7 @@
 example_tests=$(ls $(dirname $0)/*.sh)
 
 # List of script names to exclude.
-exclude_list="examples tools_common"
+exclude_list="examples stress tools_common"
 
 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 1270dae94..444b0209d 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -438,14 +438,12 @@ INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
                                                      VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
                         ::testing::Values(make_tuple(&vpx_fdct4x4_c,
                                                      &vpx_idct4x4_16_add_neon,
                                                      0, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4HT,
     ::testing::Values(
@@ -453,7 +451,8 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
         make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 3ac73c125..e40340490 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -105,20 +105,20 @@ void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
 
 #if HAVE_SSE2
 
-void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_c(in, out, stride, 10);
+void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
 }
 
-void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_c(in, out, stride, 12);
+void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
 }
 
-void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
+void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
 }
 
-void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
+void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
 }
 
 void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
@@ -670,14 +670,17 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
+                        ::testing::Values(make_tuple(&vpx_fdct8x8_c,
+                                                     &vpx_idct8x8_64_add_neon,
+                                                     0, VPX_BITS_8)));
+#else   // !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&vpx_fdct8x8_neon,
                                                      &vpx_idct8x8_64_add_neon,
                                                      0, VPX_BITS_8)));
-#endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8HT,
     ::testing::Values(
@@ -685,6 +688,7 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -728,10 +732,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, InvTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&idct8x8_10_add_10_c, &idct8x8_10_add_10_sse2, 6225,
+        make_tuple(&idct8x8_12_add_10_c, &idct8x8_12_add_10_sse2, 6225,
                    VPX_BITS_10),
         make_tuple(&idct8x8_10, &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
-        make_tuple(&idct8x8_10_add_12_c, &idct8x8_10_add_12_sse2, 6225,
+        make_tuple(&idct8x8_12_add_12_c, &idct8x8_12_add_12_sse2, 6225,
                    VPX_BITS_12),
         make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 0c704c5c8..2512366ed 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -12,6 +12,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <limits>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vp9_rtcd.h"
@@ -32,6 +34,38 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int>
     PartialInvTxfmParam;
 const int kMaxNumCoeffs = 1024;
+
+// https://bugs.chromium.org/p/webm/issues/detail?id=1332
+// The functions specified do not pass with INT16_MIN/MAX. They fail at the
+// value specified, but pass when 1 is added/subtracted.
+int16_t MaxSupportedCoeff(InvTxfmFunc a) {
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+  if (a == vpx_idct8x8_64_add_ssse3 || a == vpx_idct8x8_12_add_ssse3) {
+    return 23625 - 1;
+  }
+#else
+  (void)a;
+#endif
+  return std::numeric_limits<int16_t>::max();
+}
+
+int16_t MinSupportedCoeff(InvTxfmFunc a) {
+  (void)a;
+#if !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
+  if (a == vpx_idct8x8_64_add_ssse3 || a == vpx_idct8x8_12_add_ssse3) {
+    return -23625 + 1;
+  }
+#elif HAVE_NEON
+  if (a == vpx_idct4x4_16_add_neon) {
+    return std::numeric_limits<int16_t>::min() + 1;
+  }
+#endif
+#endif  // !CONFIG_EMULATE_HARDWARE
+  return std::numeric_limits<int16_t>::min();
+}
+
 class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
  public:
   virtual ~PartialIDctTest() {}
@@ -142,14 +176,14 @@ TEST_P(PartialIDctTest, ResultsMatch) {
     memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
     int max_energy_leftover = max_coeff * max_coeff;
     for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                          (rnd.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coef * coef;
+      int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+                                           (rnd.Rand16() - 32768) / 65536);
+      max_energy_leftover -= coeff * coeff;
       if (max_energy_leftover < 0) {
         max_energy_leftover = 0;
-        coef = 0;
+        coeff = 0;
       }
-      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
+      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coeff;
     }
 
     ASM_REGISTER_STATE_CHECK(
@@ -186,6 +220,33 @@ TEST_P(PartialIDctTest, AddOutputBlock) {
         << "Error: Transform results are not correctly added to output.";
   }
 }
+
+TEST_P(PartialIDctTest, SingleExtremeCoeff) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int16_t max_coeff = MaxSupportedCoeff(partial_itxfm_);
+  const int16_t min_coeff = MinSupportedCoeff(partial_itxfm_);
+  for (int i = 0; i < last_nonzero_; ++i) {
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    // Run once for min and once for max.
+    for (int j = 0; j < 2; ++j) {
+      const int coeff = j ? min_coeff : max_coeff;
+
+      memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+      memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
+      input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff;
+
+      ASM_REGISTER_STATE_CHECK(
+          full_itxfm_(input_block_, output_block_ref_, size_));
+      ASM_REGISTER_STATE_CHECK(
+          partial_itxfm_(input_block_, output_block_, size_));
+
+      ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                          sizeof(*output_block_) * block_size_))
+          << "Error: Fails with single coeff of " << coeff << " at " << i
+          << ".";
+    }
+  }
+}
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -221,20 +282,27 @@ INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_1_add_neon, TX_32X32, 1),
+                      make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
+                                 &vpx_idct32x32_34_add_neon, TX_32X32, 34),
                       make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,
                                  &vpx_idct16x16_1_add_neon, TX_16X16, 1),
                       make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c,
+                                 &vpx_idct8x8_64_add_neon, TX_8X8, 64),
+                      make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c,
+                                 &vpx_idct8x8_12_add_neon, TX_8X8, 12),
+                      make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c,
                                  &vpx_idct8x8_1_add_neon, TX_8X8, 1),
                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
+                                 &vpx_idct4x4_16_add_neon, TX_4X4, 16),
+                      make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
                                  &vpx_idct4x4_1_add_neon, TX_4X4, 1)));
 #else   // !CONFIG_VP9_HIGHBITDEPTH
-// 32x32_135_ is implemented using the 1024 version.
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_1024_add_neon, TX_32X32, 1024),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
-                                 &vpx_idct32x32_1024_add_neon, TX_32X32, 135),
+                                 &vpx_idct32x32_135_add_neon, TX_32X32, 135),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_34_add_neon, TX_32X32, 34),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 2e34fed06..afa32cba2 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,22 +7,39 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 
-typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr,
-                             int src_pixels_per_line, int dst_pixels_per_line,
-                             int cols, unsigned char *flimit, int size);
+using libvpx_test::ACMRandom;
+
+typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
+    unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
+    int dst_pixels_per_line, int cols, unsigned char *flimit, int size);
+
+typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch,
+                                          int rows, int cols, int flimit);
+
+typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows,
+                                      int cols, int flimit);
 
 namespace {
 
-class VPxPostProcessingFilterTest
-    : public ::testing::TestWithParam<PostProcFunc> {
+// Maps a loop filter strength to the post-proc filter level (input floor: 20).
+int q2mbl(int x) {
+  const int clamped = (x < 20) ? 20 : x;
+  // Integer arithmetic throughout; each division truncates toward zero.
+  const int scaled = 50 + (clamped - 50) * 10 / 8;
+  return scaled * scaled / 3;
+}
+
+class VpxPostProcDownAndAcrossMbRowTest
+    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 };
@@ -30,7 +47,7 @@ class VPxPostProcessingFilterTest
 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.
 
-TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
   const int block_width = 16;
   const int block_height = 16;
@@ -47,14 +64,20 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
   const int output_stride = output_width;
   const int output_size = output_width * output_height;
 
-  uint8_t *const src_image =
-      reinterpret_cast<uint8_t *>(vpx_calloc(input_size, 1));
-  uint8_t *const dst_image =
-      reinterpret_cast<uint8_t *>(vpx_calloc(output_size, 1));
+  uint8_t *const src_image = new uint8_t[input_size];
+  ASSERT_TRUE(src_image != NULL);
+
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  uint8_t *const dst_image = new uint8_t[output_size + 8];
+  ASSERT_TRUE(dst_image != NULL);
 
   // Pointers to top-left pixel of block in the input and output images.
   uint8_t *const src_image_ptr = src_image + (input_stride << 1);
-  uint8_t *const dst_image_ptr = dst_image + 8;
+
+  // The assembly works in increments of 16. The first read may be offset by
+  // this amount.
+  uint8_t *const dst_image_ptr = dst_image + 16;
   uint8_t *const flimits =
       reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
   (void)memset(flimits, 255, block_width);
@@ -78,37 +101,238 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
                                       input_stride, output_stride, block_width,
                                       flimits, 16));
 
-  static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1,
-                                                       1, 1, 1, 1, 1, 1, 3, 4 };
+  static const uint8_t kExpectedOutput[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };
 
   pixel_ptr = dst_image_ptr;
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
-      EXPECT_EQ(expected_data[i], pixel_ptr[j])
-          << "VPxPostProcessingFilterTest failed with invalid filter output";
+      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j
+                                                  << ")";
     }
     pixel_ptr += output_stride;
   }
 
-  vpx_free(src_image);
-  vpx_free(dst_image);
+  delete[] src_image;
+  delete[] dst_image;
   vpx_free(flimits);
 };
 
+class VpxMbPostProcAcrossIpTest
+    : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
+ public:
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+};
+
+TEST_P(VpxMbPostProcAcrossIpTest, CheckFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+  const int src_left_padding = 8;
+  const int src_right_padding = 17;
+  const int src_width = cols + src_left_padding + src_right_padding;
+  const int src_size = rows * src_width;
+
+  unsigned char *const src = new unsigned char[src_size];
+  ASSERT_TRUE(src != NULL);
+  memset(src, 10, src_size);
+  unsigned char *s = src + src_left_padding;
+  for (int r = 0; r < rows; r++) {
+    for (int c = 0; c < cols; c++) {
+      s[c] = c;
+    }
+    s += src_width;
+  }
+
+  s = src + src_left_padding;
+
+  ASM_REGISTER_STATE_CHECK(GetParam()(s, src_width, rows, cols, q2mbl(100)));
+
+  static const unsigned char kExpectedOutput[cols] = {
+    2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13
+  };
+  s = src + src_left_padding;
+  for (int r = 0; r < rows; r++) {
+    for (int c = 0; c < cols; c++) {
+      ASSERT_EQ(kExpectedOutput[c], s[c]) << "at (" << r << ", " << c << ")";
+    }
+    s += src_width;
+  }
+
+  delete[] src;
+}
+
+class VpxMbPostProcDownTest
+    : public ::testing::TestWithParam<VpxMbPostProcDownFunc> {
+ public:
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void SetRows(unsigned char *src_c, int rows, int cols) {
+    for (int r = 0; r < rows; r++) {
+      memset(src_c, r, cols);
+      src_c += cols;
+    }
+  }
+
+  void SetRandom(unsigned char *src_c, unsigned char *src_asm, int rows,
+                 int cols, int src_pitch) {
+    ACMRandom rnd;
+    rnd.Reset(ACMRandom::DeterministicSeed());
+
+    // Add some random noise to the input
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        const int noise = rnd(4);
+        src_c[c] = r + noise;
+        src_asm[c] = r + noise;
+      }
+      src_c += src_pitch;
+      src_asm += src_pitch;
+    }
+  }
+
+  void SetRandomSaturation(unsigned char *src_c, unsigned char *src_asm,
+                           int rows, int cols, int src_pitch) {
+    ACMRandom rnd;
+    rnd.Reset(ACMRandom::DeterministicSeed());
+
+    // Add some random noise to the input
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        const int noise = 3 * rnd(2);
+        src_c[c] = r + noise;
+        src_asm[c] = r + noise;
+      }
+      src_c += src_pitch;
+      src_asm += src_pitch;
+    }
+  }
+
+  // Checks |src_c| against a packed table holding |cols| entries per row.
+  void RunComparison(const unsigned char *kExpectedOutput, unsigned char *src_c,
+                     int rows, int cols, int src_pitch) {
+    for (int r = 0; r < rows; r++, src_c += src_pitch) {
+      for (int c = 0; c < cols; c++) {
+        ASSERT_EQ(kExpectedOutput[r * cols + c], src_c[c]) << "at (" << r
+                                                           << ", " << c << ")";
+      }
+    }
+  }
+
+  void RunComparison(unsigned char *src_c, unsigned char *src_asm, int rows,
+                     int cols, int src_pitch) {
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        ASSERT_EQ(src_c[c], src_asm[c]) << "at (" << r << ", " << c << ")";
+      }
+      src_c += src_pitch;
+      src_asm += src_pitch;
+    }
+  }
+};
+
+TEST_P(VpxMbPostProcDownTest, CheckFilterOutput) {
+  const int rows = 16;
+  const int cols = 16;
+  const int src_pitch = cols;
+  const int src_top_padding = 8;
+  const int src_bottom_padding = 17;
+
+  const int src_size = cols * (rows + src_top_padding + src_bottom_padding);
+  unsigned char *c_mem = new unsigned char[src_size];
+  ASSERT_TRUE(c_mem != NULL);
+  memset(c_mem, 10, src_size);
+  unsigned char *const src_c = c_mem + src_top_padding * src_pitch;
+
+  SetRows(src_c, rows, cols);
+  ASM_REGISTER_STATE_CHECK(
+      GetParam()(src_c, src_pitch, rows, cols, q2mbl(100)));
+
+  static const unsigned char kExpectedOutput[rows * cols] = {
+    2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  1,  1,  2,  2,  2,  2,  2,  2,  2,
+    2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  3,  4,  4,  3,  3,  3,
+    4,  4,  3,  4,  4,  3,  3,  4,  5,  4,  4,  4,  4,  4,  4,  4,  5,  4,  4,
+    4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
+    5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,
+    7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  8,  9,  9,  8,  8,  8,  9,
+    9,  8,  9,  9,  8,  8,  8,  9,  9,  10, 10, 9,  9,  9,  10, 10, 9,  10, 10,
+    9,  9,  9,  10, 10, 10, 11, 10, 10, 10, 11, 10, 11, 10, 11, 10, 10, 10, 11,
+    10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 11, 11, 11, 12, 11, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12,
+    13, 12, 13, 12, 12, 12, 13, 12, 13, 12, 13, 12, 13, 13, 13, 14, 13, 13, 13,
+    13, 13, 13, 13, 14, 13, 13, 13, 13
+  };
+
+  RunComparison(kExpectedOutput, src_c, rows, cols, src_pitch);
+
+  delete[] c_mem;
+}
+
+TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
+  const int rows = 16;
+  const int cols = 16;
+  const int src_pitch = cols;
+  const int src_top_padding = 8;
+  const int src_bottom_padding = 17;
+  const int src_size = cols * (rows + src_top_padding + src_bottom_padding);
+  unsigned char *c_mem = new unsigned char[src_size];
+  unsigned char *asm_mem = new unsigned char[src_size];
+  memset(c_mem, 10, src_size);
+  memset(asm_mem, 10, src_size);
+  unsigned char *const src_c = c_mem + src_top_padding * src_pitch;
+  unsigned char *const src_asm = asm_mem + src_top_padding * src_pitch;
+
+  SetRandom(src_c, src_asm, rows, cols, src_pitch);
+  vpx_mbpost_proc_down_c(src_c, src_pitch, rows, cols, q2mbl(100));
+  ASM_REGISTER_STATE_CHECK(
+      GetParam()(src_asm, src_pitch, rows, cols, q2mbl(100)));
+  RunComparison(src_c, src_asm, rows, cols, src_pitch);
+
+  SetRandomSaturation(src_c, src_asm, rows, cols, src_pitch);
+  vpx_mbpost_proc_down_c(src_c, src_pitch, rows, cols, q2mbl(100));
+  ASM_REGISTER_STATE_CHECK(
+      GetParam()(src_asm, src_pitch, rows, cols, q2mbl(100)));
+  RunComparison(src_c, src_asm, rows, cols, src_pitch);
+
+  delete[] c_mem;
+  delete[] asm_mem;
+}
+
 INSTANTIATE_TEST_CASE_P(
-    C, VPxPostProcessingFilterTest,
+    C, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));
 
+INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_c));
+
+INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_c));
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VPxPostProcessingFilterTest,
+    SSE2, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
+
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_sse2));
+
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_sse2));
 #endif
 
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
-    MSA, VPxPostProcessingFilterTest,
+    MSA, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
+
+INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest,
+                        ::testing::Values(vpx_mbpost_proc_across_ip_msa));
+
+INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest,
+                        ::testing::Values(vpx_mbpost_proc_down_msa));
 #endif
 
 }  // namespace
diff --git a/test/stress.sh b/test/stress.sh
new file mode 100755
index 000000000..1426194a5
--- /dev/null
+++ b/test/stress.sh
@@ -0,0 +1,141 @@
+#!/bin/sh
+##
+##  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+##  This file performs a stress test. By default it runs 10 encodes (5 two-pass
+##  and 5 real-time) and 40 (VP8) or 25 (VP9) decodes in parallel.
+
+. $(dirname $0)/tools_common.sh
+
+YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv"
+VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm"
+VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm"
+DATA_URL="http://downloads.webmproject.org/test_data/libvpx/"
+SHA1_FILE="$(dirname $0)/test-data.sha1"
+
+# Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is
+# cribbed from libs.mk.
+[ -x "$(which sha1sum)" ] && sha1sum=sha1sum
+[ -x "$(which shasum)" ] && sha1sum=shasum
+[ -x "$(which sha1)" ] && sha1sum=sha1
+
+# Download a file from the url and check its sha1sum.
+download_and_check_file() {
+  # Strip the LIBVPX_TEST_DATA_PATH prefix from $1 to get the bare file name.
+  local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}"
+
+  # Download the file using curl. Trap to ensure no partial file is left.
+  (trap "rm -f $1" INT TERM \
+    && eval "curl -L -o $1 ${DATA_URL}${root} ${devnull}")
+
+  # Check the sha1 sum of the file.
+  if [ -n "${sha1sum}" ]; then
+    set -e
+    grep ${root} ${SHA1_FILE} \
+      | (cd ${LIBVPX_TEST_DATA_PATH}; ${sha1sum} -c);
+  fi
+}
+
+# Environment check: sha1 list, input files (downloaded if missing) and tools.
+stress_verify_environment() {
+  if [ ! -e "${SHA1_FILE}" ] ; then
+    echo "Missing ${SHA1_FILE}"
+    return 1
+  fi
+  for file in "${YUV}" "${VP8}" "${VP9}"; do
+    if [ ! -e "${file}" ] ; then
+      download_and_check_file "${file}"
+    fi
+  done
+  if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then
+    elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path vpxenc)" ]; then
+    elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+  if [ -z "$(vpx_tool_path vpxdec)" ]; then
+    elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# This function runs tests on libvpx that run multiple encodes and decodes
+# in parallel in hopes of catching synchronization and/or threading issues.
+stress() {
+  local readonly decoder="$(vpx_tool_path vpxdec)"
+  local readonly encoder="$(vpx_tool_path vpxenc)"
+  local readonly codec="$1"
+  local readonly webm="$2"
+  local readonly decode_count="$3"
+  local pids=""
+  local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5}
+  local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5}
+
+  # Enable job control, so we can run multiple processes.
+  set -m
+
+  # Start $twopass_max_jobs encode jobs in parallel.
+  for i in $(seq ${twopass_max_jobs}); do
+    bitrate=$(($i * 20 + 300))
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
+      "${YUV}" "-t 4 --limit=150 --test-decode=fatal " \
+      "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.webm" \
+      ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Start $rt_max_jobs rt encode jobs in parallel.
+  for i in $(seq ${rt_max_jobs}); do
+    bitrate=$(($i * 20 + 300))
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \
+      "${YUV}" "-t 4 --limit=150 --test-decode=fatal " \
+      "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \
+      "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \
+      "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \
+      "--overshoot-pct=15 --buf-sz=1000 --buf-initial-sz=500" \
+      "--buf-optimal-sz=600 --max-intra-rate=900 --resize-allowed=0" \
+      "--drop-frame=0 --passes=1 --rt --noise-sensitivity=4" \
+      "-o ${VPX_TEST_OUTPUT_DIR}/${i}.rt.webm" ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Start $decode_count decode jobs in parallel.
+  for i in $(seq "${decode_count}"); do
+    eval "${decoder}" "-t 4" "${webm}" "--noblit" ${devnull} &
+    pids="${pids} $!"
+  done
+
+  # Wait on each job separately (pids left unquoted) so every failure counts.
+  fail=0
+  for job in ${pids}; do
+    wait $job || fail=$(($fail + 1))
+  done
+  return $fail
+}
+
+vp8_stress_test() {
+  local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40}
+  if [ "$(vp8_decode_available)" = "yes" -a \
+       "$(vp8_encode_available)" = "yes" ]; then
+    stress vp8 "${VP8}" "${vp8_max_jobs}"
+  fi
+}
+
+vp9_stress_test() {
+  local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25}
+
+  if [ "$(vp9_decode_available)" = "yes" -a \
+       "$(vp9_encode_available)" = "yes" ]; then
+    stress vp9 "${VP9}" "${vp9_max_jobs}"
+  fi
+}
+
+run_tests stress_verify_environment "vp8_stress_test vp9_stress_test"
diff --git a/test/test-data.mk b/test/test-data.mk
index e528c9182..ebf0cd8c2 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -869,3 +869,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index eda46c918..7388ddb1a 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -842,3 +842,6 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
 1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res
 70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm
 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res
+7602e00378161ca36ae93cc6ee12dd30b5ba1e1d *vp90-2-22-svc_1280x720_3.ivf
+02e53e3eefbf25ec0929047fe50876acdeb040bd *vp90-2-22-svc_1280x720_3.ivf.md5
+6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm
diff --git a/test/test.mk b/test/test.mk
index 60218a780..e25463e46 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -35,6 +35,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index a78123d3d..17dde1a52 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -478,6 +478,43 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
                        vpx_highbd_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon,
+    vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon,
+    vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon,
+    vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon,
+    vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+    vpx_highbd_tm_predictor_4x4_neon)
+HIGHBD_INTRA_PRED_TEST(
+    NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon,
+    vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon,
+    vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon,
+    vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon,
+    vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL,
+    vpx_highbd_tm_predictor_8x8_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16,
+                       vpx_highbd_dc_predictor_16x16_neon,
+                       vpx_highbd_dc_left_predictor_16x16_neon,
+                       vpx_highbd_dc_top_predictor_16x16_neon,
+                       vpx_highbd_dc_128_predictor_16x16_neon,
+                       vpx_highbd_v_predictor_16x16_neon,
+                       vpx_highbd_h_predictor_16x16_neon,
+                       vpx_highbd_d45_predictor_16x16_neon,
+                       vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL,
+                       NULL, vpx_highbd_tm_predictor_16x16_neon)
+HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32,
+                       vpx_highbd_dc_predictor_32x32_neon,
+                       vpx_highbd_dc_left_predictor_32x32_neon,
+                       vpx_highbd_dc_top_predictor_32x32_neon,
+                       vpx_highbd_dc_128_predictor_32x32_neon,
+                       vpx_highbd_v_predictor_32x32_neon,
+                       vpx_highbd_h_predictor_32x32_neon,
+                       vpx_highbd_d45_predictor_32x32_neon,
+                       vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL,
+                       NULL, vpx_highbd_tm_predictor_32x32_neon)
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #include "test/test_libvpx.cc"
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 460c1f51b..def78da28 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -373,7 +373,9 @@ const char *const kVP9TestVectors[] = {
   "vp90-2-20-big_superframe-02.webm",
   RESIZE_TEST_VECTORS
 };
+const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
+const int kNumVP9TestVectorsSvc = NELEMENTS(kVP9TestVectorsSvc);
 const char *const kVP9TestVectorsResize[] = { RESIZE_TEST_VECTORS };
 const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize);
 #undef RESIZE_TEST_VECTORS
diff --git a/test/test_vectors.h b/test/test_vectors.h
index 2c6918abd..3df3e8113 100644
--- a/test/test_vectors.h
+++ b/test/test_vectors.h
@@ -23,6 +23,8 @@ extern const char *const kVP8TestVectors[];
 #if CONFIG_VP9_DECODER
 extern const int kNumVP9TestVectors;
 extern const char *const kVP9TestVectors[];
+extern const int kNumVP9TestVectorsSvc;
+extern const char *const kVP9TestVectorsSvc[];
 extern const int kNumVP9TestVectorsResize;
 extern const char *const kVP9TestVectorsResize[];
 #endif  // CONFIG_VP9_DECODER
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index f89f852b6..4df40854b 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -124,11 +124,31 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
   ASSERT_EQ(single_thr_md5, multi_thr_md5);
 }
 
-VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
-                          ::testing::Values(::libvpx_test::kTwoPassGood,
-                                            ::libvpx_test::kOnePassGood,
-                                            ::libvpx_test::kRealTime),
-                          ::testing::Range(0, 9),   // cpu_used
-                          ::testing::Range(0, 3),   // tile_columns
-                          ::testing::Range(2, 5));  // threads
+// Split this into two instantiations so that we can distinguish
+// between very slow runs (i.e. cpu_used 0 and 1) vs ones that can be
+// run nightly by adding Large to the title.
+INSTANTIATE_TEST_CASE_P(
+    VP9, VPxEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(::libvpx_test::kTwoPassGood,
+                          ::libvpx_test::kOnePassGood,
+                          ::libvpx_test::kRealTime),
+        ::testing::Range(2, 9),    // cpu_used
+        ::testing::Range(0, 3),    // tile_columns
+        ::testing::Range(2, 5)));  // threads
+
+INSTANTIATE_TEST_CASE_P(
+    VP9Large, VPxEncoderThreadTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
+        ::testing::Values(::libvpx_test::kTwoPassGood,
+                          ::libvpx_test::kOnePassGood,
+                          ::libvpx_test::kRealTime),
+        ::testing::Range(0, 2),    // cpu_used
+        ::testing::Range(0, 3),    // tile_columns
+        ::testing::Range(2, 5)));  // threads
+
 }  // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index a715fc6f2..0819345b8 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -420,5 +420,235 @@ INSTANTIATE_TEST_CASE_P(
                              &vpx_highbd_v_predictor_32x32_c, 32, 12)));
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON_TO_C_8, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon,
+                             &vpx_highbd_dc_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon,
+                             &vpx_highbd_dc_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon,
+                             &vpx_highbd_dc_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon,
+                             &vpx_highbd_dc_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON_TO_C_10, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon,
+                             &vpx_highbd_dc_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon,
+                             &vpx_highbd_dc_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon,
+                             &vpx_highbd_dc_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon,
+                             &vpx_highbd_dc_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 10)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON_TO_C_12, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon,
+                             &vpx_highbd_d135_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon,
+                             &vpx_highbd_dc_128_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon,
+                             &vpx_highbd_dc_128_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_neon,
+                             &vpx_highbd_dc_128_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_neon,
+                             &vpx_highbd_dc_128_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_neon,
+                             &vpx_highbd_dc_left_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_neon,
+                             &vpx_highbd_dc_left_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_neon,
+                             &vpx_highbd_dc_left_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_neon,
+                             &vpx_highbd_dc_left_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_neon,
+                             &vpx_highbd_dc_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_neon,
+                             &vpx_highbd_dc_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_neon,
+                             &vpx_highbd_dc_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_neon,
+                             &vpx_highbd_dc_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_neon,
+                             &vpx_highbd_dc_top_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_neon,
+                             &vpx_highbd_dc_top_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_neon,
+                             &vpx_highbd_dc_top_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_neon,
+                             &vpx_highbd_dc_top_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_neon,
+                             &vpx_highbd_h_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_neon,
+                             &vpx_highbd_h_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_neon,
+                             &vpx_highbd_h_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon,
+                             &vpx_highbd_h_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon,
+                             &vpx_highbd_tm_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon,
+                             &vpx_highbd_tm_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon,
+                             &vpx_highbd_tm_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon,
+                             &vpx_highbd_tm_predictor_32x32_c, 32, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon,
+                             &vpx_highbd_v_predictor_4x4_c, 4, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon,
+                             &vpx_highbd_v_predictor_8x8_c, 8, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_neon,
+                             &vpx_highbd_v_predictor_16x16_c, 16, 12),
+        HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon,
+                             &vpx_highbd_v_predictor_32x32_c, 32, 12)));
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/vpx_temporal_svc_encoder.sh b/test/vpx_temporal_svc_encoder.sh
index c91f9cc56..6b6d15e7f 100755
--- a/test/vpx_temporal_svc_encoder.sh
+++ b/test/vpx_temporal_svc_encoder.sh
@@ -40,7 +40,7 @@ vpx_tsvc_encoder() {
   local timebase_den="1000"
   local speed="6"
   local frame_drop_thresh="30"
-  local threads="1"
+  local max_threads="4"
 
   shift 2
 
@@ -49,11 +49,14 @@ vpx_tsvc_encoder() {
     return 1
   fi
 
-  eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
-      "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
-      "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
-      "${threads}" "$@" \
-      ${devnull}
+  # TODO(tomfinegan): Verify file output for all thread runs.
+  for threads in $(seq $max_threads); do
+    eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
+        "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+        "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
+        "${threads}" "$@" \
+        ${devnull}
+  done
 }
 
 # Confirms that all expected output files exist given the output file name
diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c
index 7612024b7..2de343419 100644
--- a/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/vp8/common/mips/dspr2/filter_dspr2.c
@@ -1469,6 +1469,7 @@ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
   unsigned char src_ptr_r2;
   unsigned char src_ptr_r3;
   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+  (void)output_width;
 
   vector4a = 64;
 
diff --git a/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
index b79af1cc8..d2c344251 100644
--- a/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
+++ b/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c
@@ -306,6 +306,7 @@ void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
   uint32_t hev;
   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  (void)count;
 
   mask = 0;
   hev = 0;
@@ -498,6 +499,7 @@ void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
   uint32_t hev;
   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  (void)count;
 
   mask = 0;
   hev = 0;
@@ -918,6 +920,7 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   unsigned char *s1, *s2, *s3, *s4;
   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+  (void)count;
 
   /* loop filter designed to work using chars so that we can make maximum use
    * of 8 bit simd instructions.
@@ -1612,6 +1615,7 @@ void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
   uint32_t mask, hev;
   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  (void)count;
 
   mask = 0;
   hev = 0;
@@ -1915,6 +1919,7 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   unsigned char *s1, *s2, *s3, *s4;
   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+  (void)count;
 
   mask = 0;
   hev = 0;
diff --git a/vp8/common/ppflags.h b/vp8/common/ppflags.h
index 713f5dffe..96e3af6c9 100644
--- a/vp8/common/ppflags.h
+++ b/vp8/common/ppflags.h
@@ -19,14 +19,7 @@ enum {
   VP8D_DEBLOCK = 1 << 0,
   VP8D_DEMACROBLOCK = 1 << 1,
   VP8D_ADDNOISE = 1 << 2,
-  VP8D_DEBUG_TXT_FRAME_INFO = 1 << 3,
-  VP8D_DEBUG_TXT_MBLK_MODES = 1 << 4,
-  VP8D_DEBUG_TXT_DC_DIFF = 1 << 5,
-  VP8D_DEBUG_TXT_RATE_INFO = 1 << 6,
-  VP8D_DEBUG_DRAW_MV = 1 << 7,
-  VP8D_DEBUG_CLR_BLK_MODES = 1 << 8,
-  VP8D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
-  VP8D_MFQE = 1 << 10
+  VP8D_MFQE = 1 << 3
 };
 
 typedef struct {
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index c0e95b15a..bc5e05799 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -210,8 +210,9 @@ $vp8_full_search_sad_sse3=vp8_full_search_sadx3;
 $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
 
 add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_refining_search_sad sse3/;
-$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4;
+specialize qw/vp8_refining_search_sad sse2 msa/;
+$vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
+$vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
 
 add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
 specialize qw/vp8_diamond_search_sad sse2 msa/;
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index c6a39f85c..e3a088e28 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -331,8 +331,8 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
   // DC only DCT coefficient
   if (eob == 1) {
     vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
-  } else if (eob <= 10) {
-    vpx_highbd_idct8x8_10_add(input, dest, stride, bd);
+  } else if (eob <= 12) {
+    vpx_highbd_idct8x8_12_add(input, dest, stride, bd);
   } else {
     vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
   }
diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h
index 6dcfa412b..b8b647bf1 100644
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -20,14 +20,7 @@ enum {
   VP9D_DEBLOCK = 1 << 0,
   VP9D_DEMACROBLOCK = 1 << 1,
   VP9D_ADDNOISE = 1 << 2,
-  VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3,
-  VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4,
-  VP9D_DEBUG_TXT_DC_DIFF = 1 << 5,
-  VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
-  VP9D_DEBUG_DRAW_MV = 1 << 7,
-  VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
-  VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
-  VP9D_MFQE = 1 << 10
+  VP9D_MFQE = 1 << 3
 };
 
 typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index fafc65983..abef06763 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -137,6 +137,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 
   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_fdct8x8_quant ssse3/;
 } else {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error avx2 msa sse2/;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 072d92e4e..862f47288 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -383,13 +383,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
           : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
   // More aggressive settings for noisy content.
   if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) {
-    consec_zero_mv_thresh = 80;
+    consec_zero_mv_thresh = 60;
     qindex_thresh =
         VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
-               7 * cm->base_qindex >> 3);
+               cm->base_qindex);
   }
   do {
     int sum_map = 0;
+    int consec_zero_mv_thresh_block = consec_zero_mv_thresh;
     // Get the mi_row/mi_col corresponding to superblock index i.
     int sb_row_index = (i / sb_cols);
     int sb_col_index = i - sb_row_index * sb_cols;
@@ -403,6 +404,9 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
     ymis =
         VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+    if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+        (xmis <= 2 || ymis <= 2))
+      consec_zero_mv_thresh_block = 10;
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
         const int bl_index2 = bl_index + y * cm->mi_cols + x;
@@ -412,7 +416,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
         if (cr->map[bl_index2] == 0) {
           count_tot++;
           if (cr->last_coded_q_map[bl_index2] > qindex_thresh ||
-              cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) {
+              cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) {
             sum_map++;
             count_sel++;
           }
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3ab05375f..7e8727411 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -742,9 +742,12 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
           continue;
 
         if ((*this_mi)->sb_type == BLOCK_32X32) {
-          if (vt->split[i].part_variances.none.variance < (thresholds[1] >> 1))
+          int64_t threshold_32x32 = cpi->sf.short_circuit_low_temp_var == 3
+                                        ? ((3 * thresholds[1]) >> 2)
+                                        : (thresholds[1] >> 1);
+          if (vt->split[i].part_variances.none.variance < threshold_32x32)
             x->variance_low[i + 5] = 1;
-        } else if (cpi->sf.short_circuit_low_temp_var == 2) {
+        } else if (cpi->sf.short_circuit_low_temp_var >= 2) {
           // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
           // inside.
           if ((*this_mi)->sb_type == BLOCK_16X16 ||
@@ -834,8 +837,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     }
   }
 
-  threshold_4x4avg =
-      (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : thresholds[2] >> 1;
+  // For non keyframes, disable 4x4 average for low resolution when speed = 8
+  threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX;
 
   memset(x->variance_low, 0, sizeof(x->variance_low));
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 594d10ab8..600ebec9c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3218,6 +3218,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   vpx_clear_system_state();
 }
 
+#define MAX_QSTEP_ADJ 4
+static int get_qstep_adj(int rate_excess, int rate_limit) {
+  int qstep =
+      rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX;
+  return VPXMIN(qstep, MAX_QSTEP_ADJ);
+}
+
 static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
                                     uint8_t *dest) {
   VP9_COMMON *const cm = &cpi->common;
@@ -3231,9 +3238,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
+  int enable_acl;
 
   set_size_independent_vars(cpi);
 
+  enable_acl = cpi->sf.allow_acl
+                   ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0)
+                   : 0;
+
   do {
     vpx_clear_system_state();
 
@@ -3328,7 +3340,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
       if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size);
 
       rc->projected_frame_size = (int)(*size) << 3;
-      restore_coding_context(cpi);
 
       if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
     }
@@ -3391,6 +3402,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
         // to attempt to recode.
         int last_q = q;
         int retries = 0;
+        int qstep;
 
         if (cpi->resize_pending == 1) {
           // Change in frame size so go back around the recode loop.
@@ -3416,7 +3428,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
             q_high = rc->worst_quality;
 
           // Raise Qlow as to at least the current value
-          q_low = q < q_high ? q + 1 : q_high;
+          qstep =
+              get_qstep_adj(rc->projected_frame_size, rc->this_frame_target);
+          q_low = VPXMIN(q + qstep, q_high);
+          // q_low = q < q_high ? q + 1 : q_high;
 
           if (undershoot_seen || loop_at_this_size > 1) {
             // Update rate_correction_factor unless
@@ -3441,7 +3456,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
           overshoot_seen = 1;
         } else {
           // Frame is too small
-          q_high = q > q_low ? q - 1 : q_low;
+          qstep =
+              get_qstep_adj(rc->this_frame_target, rc->projected_frame_size);
+          q_high = VPXMAX(q - qstep, q_low);
+          // q_high = q > q_low ? q - 1 : q_low;
 
           if (overshoot_seen || loop_at_this_size > 1) {
             vp9_rc_update_rate_correction_factors(cpi);
@@ -3491,7 +3509,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
       ++cpi->tot_recode_hits;
 #endif
     }
+
+    if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF)
+      if (loop || !enable_acl) restore_coding_context(cpi);
   } while (loop);
+
+  if (enable_acl) {
+    vp9_encode_frame(cpi);
+    vpx_clear_system_state();
+    restore_coding_context(cpi);
+    vp9_pack_bitstream(cpi, dest, size);
+
+    vp9_encode_frame(cpi);
+    vpx_clear_system_state();
+
+    restore_coding_context(cpi);
+  }
 }
 
 static int get_ref_frame_flags(const VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index ce9007ac3..8b84c3825 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1490,6 +1490,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
   if (cpi->sf.short_circuit_low_temp_var) {
     force_skip_low_temp_var =
         get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+    // In the most aggressive short circuit, skip golden in any mode
+    if (cpi->sf.short_circuit_low_temp_var == 3 && force_skip_low_temp_var) {
+      usable_ref_frame = LAST_FRAME;
+    }
   }
 
   if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
@@ -1558,7 +1562,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
       continue;
     }
 
-    if (cpi->sf.short_circuit_low_temp_var == 2 && force_skip_low_temp_var &&
+    if (cpi->sf.short_circuit_low_temp_var >= 2 && force_skip_low_temp_var &&
         ref_frame == LAST_FRAME && this_mode == NEWMV) {
       continue;
     }
@@ -1573,7 +1577,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
         !(frame_mv[this_mode][ref_frame].as_int == 0 &&
           ref_frame == LAST_FRAME)) {
       if (usable_ref_frame < ALTREF_FRAME) {
-        if (!force_skip_low_temp_var) {
+        if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
           i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
           if ((cpi->ref_frame_flags & flag_list[i]))
             if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 3e1ed50a6..f500f2f98 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -182,6 +182,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
     sf->mv.subpel_iters_per_step = 1;
     sf->mode_skip_start = 10;
     sf->adaptive_pred_interp_filter = 1;
+    sf->allow_acl = 0;
 
     sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
@@ -309,6 +310,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
   sf->use_fast_coef_costing = 1;
   sf->allow_exhaustive_searches = 0;
   sf->exhaustive_searches_thresh = INT_MAX;
+  sf->allow_acl = 0;
 
   if (speed >= 1) {
     sf->allow_txfm_domain_distortion = 1;
@@ -506,6 +508,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed,
         content != VP9E_CONTENT_SCREEN) {
       // More aggressive short circuit for speed 8.
       sf->short_circuit_low_temp_var = 2;
+      // More aggressive short circuit for low resolution
+      if (cm->width <= 352 && cm->height <= 288) {
+        sf->short_circuit_low_temp_var = 3;
+      }
     }
     sf->limit_newmv_early_exit = 0;
   }
@@ -592,6 +598,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->tx_domain_thresh = 99.0;
   sf->allow_quant_coeff_opt = sf->optimize_coefficients;
   sf->quant_opt_thresh = 99.0;
+  sf->allow_acl = 1;
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 6d0b9420a..b63ca6cbb 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -244,6 +244,10 @@ typedef struct SPEED_FEATURES {
   int allow_quant_coeff_opt;
   double quant_opt_thresh;
 
+  // Enable asymptotic closed-loop encoding decision for key frame and
+  // alternate reference frames.
+  int allow_acl;
+
   // Use transform domain distortion. Use pixel domain distortion in speed 0
   // and certain situations in higher speed to improve the RD model precision.
   int allow_txfm_domain_distortion;
@@ -457,6 +461,7 @@ typedef struct SPEED_FEATURES {
   // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
   // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
   // 32x16.
+  // 3: Same as (2), also skip golden in any mode for low res
   int short_circuit_low_temp_var;
 
   // Limits the rd-threshold update for early exit for the newmv-last mode,
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index fb2a92541..b3c3d7beb 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -12,14 +12,17 @@
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fdct.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
 void vp9_fdct8x8_quant_ssse3(
-    const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
+    const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs,
     int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
     const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-    int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
   __m128i zero;
   int pass;
@@ -328,15 +331,15 @@ void vp9_fdct8x8_quant_ssse3(
         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         dequant = _mm_unpackhi_epi64(dequant, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }
 
       {
@@ -398,20 +401,21 @@ void vp9_fdct8x8_quant_ssse3(
           qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
           qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
           coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
           coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
         } else {
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+          // Maybe a more efficient way to store 0?
+          store_zero_tran_low(qcoeff_ptr + n_coeffs);
+          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
 
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
         }
       }
 
@@ -452,10 +456,10 @@ void vp9_fdct8x8_quant_ssse3(
     }
   } else {
     do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
       n_coeffs += 8 * 2;
     } while (n_coeffs < 0);
     *eob_ptr = 0;
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index a797b2c26..4e8e40440 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -157,6 +157,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
   RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
   RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
   RANGE_CHECK_BOOL(extra_cfg, lossless);
+  RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode);
   RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2);
   RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1);
   RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 4a1ebbc8c..0a3e84a0d 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -553,6 +553,9 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                                    ctx->decrypt_cb, ctx->decrypt_state);
   if (res != VPX_CODEC_OK) return res;
 
+  if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1)
+    frame_count = ctx->svc_spatial_layer + 1;
+
   if (ctx->frame_parallel_decode) {
     // Decode in frame parallel mode. When decoding in this mode, the frame
     // passed to the decoder must be either a normal frame or a superframe with
@@ -1001,6 +1004,16 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  ctx->svc_decoding = 1;
+  ctx->svc_spatial_layer = va_arg(args, int);
+  if (ctx->svc_spatial_layer < 0)
+    return VPX_CODEC_INVALID_PARAM;
+  else
+    return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { VP8_COPY_REFERENCE, ctrl_copy_reference },
 
@@ -1011,6 +1024,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { VPXD_SET_DECRYPTOR, ctrl_set_decryptor },
   { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
   { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+  { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc },
 
   // Getters
   { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
diff --git a/vp9/vp9_dx_iface.h b/vp9/vp9_dx_iface.h
index cc3d51842..c1559599b 100644
--- a/vp9/vp9_dx_iface.h
+++ b/vp9/vp9_dx_iface.h
@@ -60,6 +60,10 @@ struct vpx_codec_alg_priv {
   void *ext_priv;  // Private data associated with the external frame buffers.
   vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
   vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+  // Allow for decoding up to a given spatial layer for SVC stream.
+  int svc_decoding;
+  int svc_spatial_layer;
 };
 
 #endif  // VP9_VP9_DX_IFACE_H_
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 88204acd3..0d7759eb2 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -111,6 +111,11 @@ enum vp8_dec_control_id {
    */
   VP9_SET_SKIP_LOOP_FILTER,
 
+  /** control function to decode an SVC stream up to spatial layer x,
+   * where x is passed in through the control, and is 0 for the base layer.
+   */
+  VP9_DECODE_SVC_SPATIAL_LAYER,
+
   VP8_DECODER_CTRL_ID_MAX
 };
 
@@ -162,6 +167,8 @@ VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
 #define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
 
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 000000000..6f7e5da76
--- /dev/null
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,1078 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
+  const uint16x4_t ref_u16 = vld1_u16(ref);
+  const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
+  return vpadd_u16(p0, p0);
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+                                const uint16x4_t dc) {
+  const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
+  int i;
+  for (i = 0; i < 4; ++i, dst += stride) {
+    vst1_u16(dst, dc_dup);
+  }
+}
+
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x4_t a = vld1_u16(above);
+  const uint16x4_t l = vld1_u16(left);
+  uint16x4_t sum;
+  uint16x4_t dc;
+  (void)bd;
+  sum = vadd_u16(a, l);
+  sum = vpadd_u16(sum, sum);
+  sum = vpadd_u16(sum, sum);
+  dc = vrshr_n_u16(sum, 3);
+  dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const uint16x4_t sum = dc_sum_4(left);
+  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  (void)above;
+  (void)bd;
+  dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x4_t sum = dc_sum_4(above);
+  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  (void)left;
+  (void)bd;
+  dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_4x4(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
+  const uint16x8_t ref_u16 = vld1q_u16(ref);
+  uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
+  sum = vpadd_u16(sum, sum);
+  return vpadd_u16(sum, sum);
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+                                const uint16x4_t dc) {
+  const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
+  int i;
+  for (i = 0; i < 8; ++i, dst += stride) {
+    vst1q_u16(dst, dc_dup);
+  }
+}
+
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const uint16x8_t above_u16 = vld1q_u16(above);
+  const uint16x8_t left_u16 = vld1q_u16(left);
+  const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
+  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+  uint16x4_t dc;
+  (void)bd;
+  sum = vpadd_u16(sum, sum);
+  sum = vpadd_u16(sum, sum);
+  dc = vrshr_n_u16(sum, 4);
+  dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                           const uint16_t *above,
+                                           const uint16_t *left, int bd) {
+  const uint16x4_t sum = dc_sum_8(left);
+  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  (void)above;
+  (void)bd;
+  dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x4_t sum = dc_sum_8(above);
+  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  (void)left;
+  (void)bd;
+  dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_8x8(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
+  const uint16x8x2_t ref_u16 = vld2q_u16(ref);
+  const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]);
+  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+  sum = vpadd_u16(sum, sum);
+  return vpadd_u16(sum, sum);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+                                  const uint16x4_t dc) {
+  uint16x8x2_t dc_dup;
+  int i;
+  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
+  for (i = 0; i < 16; ++i, dst += stride) {
+    vst2q_u16(dst, dc_dup);
+  }
+}
+
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8x2_t a = vld2q_u16(above);
+  const uint16x8x2_t l = vld2q_u16(left);
+  const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]);
+  const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]);
+  const uint16x8_t pal0 = vaddq_u16(pa, pl);
+  uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
+  uint32x2_t sum;
+  uint16x4_t dc;
+  (void)bd;
+  pal1 = vpadd_u16(pal1, pal1);
+  sum = vpaddl_u16(pal1);
+  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const uint16x4_t sum = dc_sum_16(left);
+  const uint16x4_t dc = vrshr_n_u16(sum, 4);
+  (void)above;
+  (void)bd;
+  dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint16x4_t sum = dc_sum_16(above);
+  const uint16x4_t dc = vrshr_n_u16(sum, 4);
+  (void)left;
+  (void)bd;
+  dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  (void)above;
+  (void)left;
+  dc_store_16x16(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
+  const uint16x8x4_t r = vld4q_u16(ref);
+  const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]);
+  const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]);
+  const uint16x8_t p2 = vaddq_u16(p0, p1);
+  uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+  sum = vpadd_u16(sum, sum);
+  return vpaddl_u16(sum);
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+                                  const uint16x4_t dc) {
+  uint16x8x2_t dc_dup;
+  int i;
+  dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
+
+  for (i = 0; i < 32; ++i) {
+    vst2q_u16(dst, dc_dup);
+    dst += 16;
+    vst2q_u16(dst, dc_dup);
+    dst += stride - 16;
+  }
+}
+
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8x4_t a = vld4q_u16(above);
+  const uint16x8x4_t l = vld4q_u16(left);
+  const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]);
+  const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]);
+  const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]);
+  const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]);
+  const uint16x8_t pa = vaddq_u16(pa0, pa1);
+  const uint16x8_t pl = vaddq_u16(pl0, pl1);
+  const uint16x8_t pal0 = vaddq_u16(pa, pl);
+  const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
+  uint32x2_t sum = vpaddl_u16(pal1);
+  uint16x4_t dc;
+  (void)bd;
+  sum = vpadd_u32(sum, sum);
+  dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6));
+  dc_store_32x32(dst, stride, dc);
+}
+// 32x32 DC_LEFT: fill with (sum(left[0..31]) + 16) >> 5; above is unused.
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                             const uint16_t *above,
+                                             const uint16_t *left, int bd) {
+  const uint32x2_t sum = dc_sum_32(left);
+  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  (void)above;
+  (void)bd;
+  dc_store_32x32(dst, stride, dc);  // only lane 0 of dc is used
+}
+// 32x32 DC_TOP: fill with (sum(above[0..31]) + 16) >> 5; left is unused.
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint32x2_t sum = dc_sum_32(above);
+  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+  (void)left;
+  (void)bd;
+  dc_store_32x32(dst, stride, dc);  // only lane 0 of dc is used
+}
+// 32x32 DC_128: fill with 1 << (bd - 1); neither edge is read.
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                            const uint16_t *above,
+                                            const uint16_t *left, int bd) {
+  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));  // half of the 2^bd range
+  (void)above;
+  (void)left;
+  dc_store_32x32(dst, stride, dc);
+}
+
+// -----------------------------------------------------------------------------
+// 4x4 D45: row r holds the 3-tap filtered above[] starting at column r.
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8_t ABCDEFGH = vld1q_u16(above);
+  const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);  // top lane is above[8]
+  const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);  // reads up to above[9]
+  const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);  // (a + c) >> 1
+  const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);  // (a + 2b + c + 2) >> 2
+  const uint16x4_t avg2_low = vget_low_u16(avg2);
+  const uint16x4_t avg2_high = vget_high_u16(avg2);
+  const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);  // avg2[1..4]
+  const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);  // avg2[2..5]
+  const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);  // avg2[3..6]
+  (void)left;
+  (void)bd;
+  vst1_u16(dst, avg2_low);
+  dst += stride;
+  vst1_u16(dst, r1);
+  dst += stride;
+  vst1_u16(dst, r2);
+  dst += stride;
+  vst1_u16(dst, r3);
+  vst1q_lane_u16(dst + 3, ABCDEFGH, 7);  // bottom-right pixel = above[7]
+}
+// Shifts *row one lane toward lane 0, padding with above_right; stores it.
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+                               const uint16x8_t above_right, uint16x8_t *row) {
+  *row = vextq_u16(*row, above_right, 1);  // row[1..7], then above_right[0]
+  vst1q_u16(*dst, *row);
+  *dst += stride;  // caller's pointer now addresses the next row
+}
+// 8x8 D45: filtered above[] row, shifted one lane per row, padded w/ above[7].
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8_t A0 = vld1q_u16(above);
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);  // a[7]
+  const uint16x8_t A1 = vld1q_u16(above + 1);
+  const uint16x8_t A2 = vld1q_u16(above + 2);  // reads up to above[9]
+  const uint16x8_t avg1 = vhaddq_u16(A0, A2);
+  uint16x8_t row = vrhaddq_u16(avg1, A1);  // (a[i] + 2a[i+1] + a[i+2] + 2) >> 2
+  (void)left;
+  (void)bd;
+
+  vst1q_u16(dst, row);  // row 0
+  dst += stride;
+  d45_store_8(&dst, stride, above_right, &row);  // rows 1..6
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  d45_store_8(&dst, stride, above_right, &row);
+  vst1q_u16(dst, above_right);  // row 7 is entirely above[7]
+}
+// 16-wide variant: shifts row_0:row_1 one lane toward lane 0 and stores it.
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                const uint16x8_t above_right, uint16x8_t *row_0,
+                                uint16x8_t *row_1) {
+  *row_0 = vextq_u16(*row_0, *row_1, 1);  // pull row_1 lane 0 into row_0
+  *row_1 = vextq_u16(*row_1, above_right, 1);  // pad with above_right
+  vst1q_u16(*dst, *row_0);
+  *dst += 8;
+  vst1q_u16(*dst, *row_1);
+  *dst += stride - 8;  // net advance of one row
+}
+// 16x16 D45: filtered above[] row, shifted one lane per row, above[15] padding.
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  const uint16x8_t A0_0 = vld1q_u16(above);
+  const uint16x8_t A0_1 = vld1q_u16(above + 8);
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
+  const uint16x8_t A1_0 = vld1q_u16(above + 1);
+  const uint16x8_t A1_1 = vld1q_u16(above + 9);
+  const uint16x8_t A2_0 = vld1q_u16(above + 2);
+  const uint16x8_t A2_1 = vld1q_u16(above + 10);  // reads up to above[17]
+  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
+  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);  // 3-tap result, columns 0..7
+  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);  // 3-tap result, columns 8..15
+  (void)left;
+  (void)bd;
+
+  vst1q_u16(dst, row_0);  // row 0
+  vst1q_u16(dst + 8, row_1);
+  dst += stride;
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);  // rows 1..14
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+  vst1q_u16(dst, above_right);  // row 15 is entirely above[15]
+  vst1q_u16(dst + 8, above_right);
+}
+// 32x32 D45: filtered above[] row, shifted one lane per row, above[31] padding.
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                         const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  const uint16x8_t A0_0 = vld1q_u16(above);
+  const uint16x8_t A0_1 = vld1q_u16(above + 8);
+  const uint16x8_t A0_2 = vld1q_u16(above + 16);
+  const uint16x8_t A0_3 = vld1q_u16(above + 24);
+  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
+  const uint16x8_t A1_0 = vld1q_u16(above + 1);
+  const uint16x8_t A1_1 = vld1q_u16(above + 9);
+  const uint16x8_t A1_2 = vld1q_u16(above + 17);
+  const uint16x8_t A1_3 = vld1q_u16(above + 25);
+  const uint16x8_t A2_0 = vld1q_u16(above + 2);
+  const uint16x8_t A2_1 = vld1q_u16(above + 10);
+  const uint16x8_t A2_2 = vld1q_u16(above + 18);
+  const uint16x8_t A2_3 = vld1q_u16(above + 26);  // reads up to above[33]
+  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
+  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+  const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
+  const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
+  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);  // 3-tap result, columns 0..7
+  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);  // columns 8..15
+  uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);  // columns 16..23
+  uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);  // columns 24..31
+  int i;
+  (void)left;
+  (void)bd;
+
+  vst1q_u16(dst, row_0);  // row 0 is the filtered row as computed
+  dst += 8;
+  vst1q_u16(dst, row_1);
+  dst += 8;
+  vst1q_u16(dst, row_2);
+  dst += 8;
+  vst1q_u16(dst, row_3);
+  dst += stride - 24;  // net advance of one row
+
+  for (i = 0; i < 30; ++i) {  // rows 1..30
+    row_0 = vextq_u16(row_0, row_1, 1);  // shift all 32 lanes toward lane 0
+    row_1 = vextq_u16(row_1, row_2, 1);
+    row_2 = vextq_u16(row_2, row_3, 1);
+    row_3 = vextq_u16(row_3, above_right, 1);  // pad with above[31]
+    vst1q_u16(dst, row_0);
+    dst += 8;
+    vst1q_u16(dst, row_1);
+    dst += 8;
+    vst1q_u16(dst, row_2);
+    dst += 8;
+    vst1q_u16(dst, row_3);
+    dst += stride - 24;
+  }
+
+  vst1q_u16(dst, above_right);  // row 31 is entirely above[31]
+  dst += 8;
+  vst1q_u16(dst, above_right);
+  dst += 8;
+  vst1q_u16(dst, above_right);
+  dst += 8;
+  vst1q_u16(dst, above_right);
+}
+
+// -----------------------------------------------------------------------------
+// 4x4 D135: 3-tap filter along the edge left[3..0], above[-1], above[0..3].
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8_t XA0123___ = vld1q_u16(above - 1);  // X = above[-1]
+  const uint16x4_t L0123 = vld1_u16(left);
+  const uint16x4_t L3210 = vrev64_u16(L0123);  // left[] in reverse order
+  const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);  // low half unused
+  const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
+  const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
+  const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
+  const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);  // (a + c) >> 1
+  const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);  // (a + 2b + c + 2) >> 2
+  const uint16x4_t row_0 = vget_low_u16(avg2);
+  const uint16x4_t row_1 = vget_high_u16(avg2);
+  const uint16x4_t r0 = vext_u16(row_0, row_1, 3);  // avg2[3..6]
+  const uint16x4_t r1 = vext_u16(row_0, row_1, 2);  // avg2[2..5]
+  const uint16x4_t r2 = vext_u16(row_0, row_1, 1);  // avg2[1..4]
+  (void)bd;
+  vst1_u16(dst, r0);
+  dst += stride;
+  vst1_u16(dst, r1);
+  dst += stride;
+  vst1_u16(dst, r2);
+  dst += stride;
+  vst1_u16(dst, row_0);  // bottom row = avg2[0..3]
+}
+// 8x8 D135: 3-tap filter along left[7..0], above[-1], above[0..]; shift per row.
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const uint16x8_t XA0123456 = vld1q_u16(above - 1);  // X = above[-1]
+  const uint16x8_t A01234567 = vld1q_u16(above);
+  const uint16x8_t A1234567_ = vld1q_u16(above + 1);  // reads up to above[8]
+  const uint16x8_t L01234567 = vld1q_u16(left);
+  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);  // left[] reversed
+  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+  const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
+  const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
+  const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);  // left part of edge
+  const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);  // above part of edge
+  const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);  // row_0[7], row_1[0..6]
+  const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
+  const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
+  const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
+  const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
+  const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
+  const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
+  (void)bd;
+  vst1q_u16(dst, r0);
+  dst += stride;
+  vst1q_u16(dst, r1);
+  dst += stride;
+  vst1q_u16(dst, r2);
+  dst += stride;
+  vst1q_u16(dst, r3);
+  dst += stride;
+  vst1q_u16(dst, r4);
+  dst += stride;
+  vst1q_u16(dst, r5);
+  dst += stride;
+  vst1q_u16(dst, r6);
+  dst += stride;
+  vst1q_u16(dst, row_0);  // bottom row uses only the left-derived samples
+}
+// Stores one 16-wide row (row_0 then row_1) and advances *dst to the next row.
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+                                 const uint16x8_t row_0,
+                                 const uint16x8_t row_1) {
+  vst1q_u16(*dst, row_0);
+  *dst += 8;
+  vst1q_u16(*dst, row_1);
+  *dst += stride - 8;  // net advance of one row
+}
+// 16x16 D135: row_0..row_3 hold the filtered edge from left[15] to above[15].
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x8_t L01234567 = vld1q_u16(left);
+  const uint16x8_t L89abcdef = vld1q_u16(left + 8);
+  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+  const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
+  const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
+  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);  // left[7..0]
+  const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);  // left[15..8]
+  const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
+  const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
+  const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
+  const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);  // edge samples 0..7
+
+  const uint16x8_t XA0123456 = vld1q_u16(above - 1);  // X = above[-1]
+  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+  const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
+  const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);  // edge samples 8..15
+
+  const uint16x8_t A01234567 = vld1q_u16(above);
+  const uint16x8_t A12345678 = vld1q_u16(above + 1);
+  const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
+  const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);  // edge samples 16..23
+
+  const uint16x8_t A789abcde = vld1q_u16(above + 7);
+  const uint16x8_t A89abcdef = vld1q_u16(above + 8);
+  const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);  // reads up to above[16]
+  const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
+  const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);  // edge samples 24..31
+
+  const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);  // rN_0: left 8 columns
+  const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);  // rN_1: right 8 columns
+  const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
+  const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
+  const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
+  const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
+  const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
+  const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
+  const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
+  const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
+  const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
+  const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
+  const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
+  const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
+  const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);  // rows 8..14, left half
+  const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
+  const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
+  const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
+  const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
+  const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
+  const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
+  (void)bd;
+
+  d135_store_16(&dst, stride, r0_0, r0_1);  // rows 0..6
+  d135_store_16(&dst, stride, r1_0, r1_1);
+  d135_store_16(&dst, stride, r2_0, r2_1);
+  d135_store_16(&dst, stride, r3_0, r3_1);
+  d135_store_16(&dst, stride, r4_0, r4_1);
+  d135_store_16(&dst, stride, r5_0, r5_1);
+  d135_store_16(&dst, stride, r6_0, r6_1);
+  d135_store_16(&dst, stride, row_1, row_2);  // row 7 needs no lane shift
+  d135_store_16(&dst, stride, r8_0, r0_0);  // rows 8..14 reuse rows 0..6 halves
+  d135_store_16(&dst, stride, r9_0, r1_0);
+  d135_store_16(&dst, stride, ra_0, r2_0);
+  d135_store_16(&dst, stride, rb_0, r3_0);
+  d135_store_16(&dst, stride, rc_0, r4_0);
+  d135_store_16(&dst, stride, rd_0, r5_0);
+  d135_store_16(&dst, stride, re_0, r6_0);
+  vst1q_u16(dst, row_0);  // row 15
+  dst += 8;
+  vst1q_u16(dst, row_1);
+}
+// 32x32 D135: row_0..row_7 hold the filtered edge from left[31] to above[31].
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int bd) {
+  const uint16x8_t LL01234567 = vld1q_u16(left + 16);  // LL: left[16..31]
+  const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
+  const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
+  const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
+  const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
+  const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
+  const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);  // left[23..16]
+  const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);  // left[31..24]
+  const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
+  const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
+  const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
+  uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
+
+  const uint16x8_t LU01234567 = vld1q_u16(left);  // LU: left[0..15]
+  const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
+  const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
+  const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
+  const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
+  const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
+  const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);  // left[7..0]
+  const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);  // left[15..8]
+  const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
+  const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
+  const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
+  uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
+
+  const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
+  const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
+  const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
+  uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
+
+  const uint16x8_t XAL0123456 = vld1q_u16(above - 1);  // X = above[-1]
+  const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
+  const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
+  const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
+  uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
+
+  const uint16x8_t AL01234567 = vld1q_u16(above);  // AL: above[0..15]
+  const uint16x8_t AL12345678 = vld1q_u16(above + 1);
+  const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
+  uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
+
+  const uint16x8_t AL789abcde = vld1q_u16(above + 7);
+  const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
+  const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
+  const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
+  uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
+
+  const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);  // AR: above[16..31]
+  const uint16x8_t AR01234567 = vld1q_u16(above + 16);
+  const uint16x8_t AR12345678 = vld1q_u16(above + 17);
+  const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
+  uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
+
+  const uint16x8_t AR789abcde = vld1q_u16(above + 23);
+  const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
+  const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);  // reads up to above[32]
+  const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
+  uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
+  int i, j;
+  (void)bd;
+
+  dst += 31 * stride;  // start at the bottom row and work upwards
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 8; ++j) {
+      vst1q_u16(dst, row_0);
+      dst += 8;
+      vst1q_u16(dst, row_1);
+      dst += 8;
+      vst1q_u16(dst, row_2);
+      dst += 8;
+      vst1q_u16(dst, row_3);
+      dst -= stride + 24;  // back to column 0 of the row above
+      row_0 = vextq_u16(row_0, row_1, 1);  // slide the window one edge sample
+      row_1 = vextq_u16(row_1, row_2, 1);
+      row_2 = vextq_u16(row_2, row_3, 1);
+      row_3 = vextq_u16(row_3, row_4, 1);
+      row_4 = vextq_u16(row_4, row_4, 1);  // rotate so lane 0 is the next feed
+    }
+    row_4 = row_5;  // row_4 fully consumed after 8 shifts; queue the next one
+    row_5 = row_6;
+    row_6 = row_7;
+  }
+}
+
+//------------------------------------------------------------------------------
+// 4x4 V: copies above[0..3] into each of the 4 rows of dst.
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t row = vld1_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, dst += stride) {
+    vst1_u16(dst, row);
+  }
+}
+// 8x8 V: copies above[0..7] into each of the 8 rows of dst.
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t row = vld1q_u16(above);
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 8; i++, dst += stride) {
+    vst1q_u16(dst, row);
+  }
+}
+// 16x16 V: copies above[0..15] into each of the 16 rows of dst.
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8x2_t row = vld2q_u16(above);  // de-interleaved on load
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 16; i++, dst += stride) {
+    vst2q_u16(dst, row);  // re-interleaved on store, so order is preserved
+  }
+}
+// 32x32 V: copies above[0..31] into each of the 32 rows of dst.
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  const uint16x8x2_t row0 = vld2q_u16(above);  // columns 0..15
+  const uint16x8x2_t row1 = vld2q_u16(above + 16);  // columns 16..31
+  int i;
+  (void)left;
+  (void)bd;
+
+  for (i = 0; i < 32; i++) {
+    vst2q_u16(dst, row0);
+    dst += 16;
+    vst2q_u16(dst, row1);
+    dst += stride - 16;  // net advance of one row
+  }
+}
+
+// -----------------------------------------------------------------------------
+// 4x4 H: row r of dst is filled with left[r].
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x4_t left_u16 = vld1_u16(left);
+  uint16x4_t row;
+  (void)above;
+  (void)bd;
+
+  row = vdup_lane_u16(left_u16, 0);  // broadcast left[0] across the row
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 1);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 2);
+  vst1_u16(dst, row);
+  dst += stride;
+  row = vdup_lane_u16(left_u16, 3);
+  vst1_u16(dst, row);
+}
+// 8x8 H: row r of dst is filled with left[r].
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const uint16x8_t left_u16 = vld1q_u16(left);
+  const uint16x4_t left_low = vget_low_u16(left_u16);  // left[0..3]
+  const uint16x4_t left_high = vget_high_u16(left_u16);  // left[4..7]
+  uint16x8_t row;
+  (void)above;
+  (void)bd;
+
+  row = vdupq_lane_u16(left_low, 0);  // broadcast left[0] across the row
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 1);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 2);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_low, 3);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 0);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 1);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 2);
+  vst1q_u16(dst, row);
+  dst += stride;
+  row = vdupq_lane_u16(left_high, 3);
+  vst1q_u16(dst, row);
+}
+// Stores the 8-lane vector row twice (16 values) and advances *dst one row.
+static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
+                              const uint16x8_t row) {
+  // Note: vst1q is faster than vst2q
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += stride - 8;  // net advance of one row
+}
+// 16x16 H: row r of dst is filled with left[r].
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 2; i++, left += 8) {  // 8 rows per iteration
+    const uint16x8_t left_u16q = vld1q_u16(left);
+    const uint16x4_t left_low = vget_low_u16(left_u16q);
+    const uint16x4_t left_high = vget_high_u16(left_u16q);
+    uint16x8_t row;
+
+    row = vdupq_lane_u16(left_low, 0);
+    h_store_16(&dst, stride, row);  // also advances dst to the next row
+    row = vdupq_lane_u16(left_low, 1);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 2);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 3);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 0);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 1);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 2);
+    h_store_16(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 3);
+    h_store_16(&dst, stride, row);
+  }
+}
+// Stores the 8-lane vector row four times (32 values); advances *dst one row.
+static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
+                              const uint16x8_t row) {
+  // Note: vst1q is faster than vst2q
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += 8;
+  vst1q_u16(*dst, row);
+  *dst += stride - 24;  // net advance of one row
+}
+// 32x32 H: row r of dst is filled with left[r].
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, left += 8) {  // 8 rows per iteration
+    const uint16x8_t left_u16q = vld1q_u16(left);
+    const uint16x4_t left_low = vget_low_u16(left_u16q);
+    const uint16x4_t left_high = vget_high_u16(left_u16q);
+    uint16x8_t row;
+
+    row = vdupq_lane_u16(left_low, 0);
+    h_store_32(&dst, stride, row);  // also advances dst to the next row
+    row = vdupq_lane_u16(left_low, 1);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 2);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_low, 3);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 0);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 1);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 2);
+    h_store_32(&dst, stride, row);
+    row = vdupq_lane_u16(left_high, 3);
+    h_store_32(&dst, stride, row);
+  }
+}
+
+// -----------------------------------------------------------------------------
+// 4x4 TM: dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, 2^bd - 1).
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);  // upper clamp bound
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
+  const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);  // 2 copies
+  const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
+  const int16x8_t sub = vsubq_s16(above_s16, top_left);  // above[c] - above[-1]
+  int16x8_t sum;
+  uint16x8_t row;
+
+  sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+  sum = vaddq_s16(sum, sub);  // rows 0 and 1 packed in one vector
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);  // signed -> unsigned: negatives saturate to 0
+  vst1_u16(dst, vget_low_u16(row));
+  dst += stride;
+  vst1_u16(dst, vget_high_u16(row));
+  dst += stride;
+
+  sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+  sum = vaddq_s16(sum, sub);  // rows 2 and 3 packed in one vector
+  sum = vminq_s16(sum, max);
+  row = vqshluq_n_s16(sum, 0);
+  vst1_u16(dst, vget_low_u16(row));
+  dst += stride;
+  vst1_u16(dst, vget_high_u16(row));
+}
+// Stores one 8-wide TM row, clamp(left_dup + sub, 0, max); advances *dst.
+static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
+                               const int16x8_t left_dup, const int16x8_t sub,
+                               const int16x8_t max) {
+  uint16x8_t row;
+  int16x8_t sum = vaddq_s16(left_dup, sub);
+  sum = vminq_s16(sum, max);  // upper clamp
+  row = vqshluq_n_s16(sum, 0);  // signed -> unsigned: negatives saturate to 0
+  vst1q_u16(*dst, row);
+  *dst += stride;
+}
+// 8x8 TM: dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, 2^bd - 1).
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+                                      const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);  // upper clamp bound
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
+  const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
+  const int16x8_t sub = vsubq_s16(above_s16, top_left);  // above[c] - above[-1]
+  int16x4_t left_s16d;
+  int16x8_t left_dup;
+  int i;
+
+  left_s16d = vget_low_s16(left_s16);  // first pass uses left[0..3]
+
+  for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
+    left_dup = vdupq_lane_s16(left_s16d, 0);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 1);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 2);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+    left_dup = vdupq_lane_s16(left_s16d, 3);
+    tm_8_kernel(&dst, stride, left_dup, sub, max);
+  }
+}
+// Stores one 16-wide TM row, clamp(left_dup + subN, 0, max); advances *dst.
+static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t max) {
+  uint16x8_t row0, row1;
+  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  sum0 = vminq_s16(sum0, max);  // upper clamp
+  sum1 = vminq_s16(sum1, max);
+  row0 = vqshluq_n_s16(sum0, 0);  // negatives saturate to 0
+  row1 = vqshluq_n_s16(sum1, 0);
+  vst1q_u16(*dst, row0);
+  *dst += 8;
+  vst1q_u16(*dst, row1);
+  *dst += stride - 8;  // net advance of one row
+}
+// 16x16 TM: dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, 2^bd - 1).
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);  // upper clamp bound
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+  const int16x8_t sub0 = vsubq_s16(above0, top_left);  // columns 0..7
+  const int16x8_t sub1 = vsubq_s16(above1, top_left);  // columns 8..15
+  int16x8_t left_dup;
+  int i, j;
+
+  for (j = 0; j < 2; j++, left += 8) {  // 8 rows per outer iteration
+    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+    }
+  }
+}
+// Stores one 32-wide TM row, clamp(left_dup + subN, 0, max); advances *dst.
+static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t sub2,
+                                const int16x8_t sub3, const int16x8_t max) {
+  uint16x8_t row0, row1, row2, row3;
+  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+  int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+  sum0 = vminq_s16(sum0, max);  // upper clamp
+  sum1 = vminq_s16(sum1, max);
+  sum2 = vminq_s16(sum2, max);
+  sum3 = vminq_s16(sum3, max);
+  row0 = vqshluq_n_s16(sum0, 0);  // negatives saturate to 0
+  row1 = vqshluq_n_s16(sum1, 0);
+  row2 = vqshluq_n_s16(sum2, 0);
+  row3 = vqshluq_n_s16(sum3, 0);
+  vst1q_u16(*dst, row0);
+  *dst += 8;
+  vst1q_u16(*dst, row1);
+  *dst += 8;
+  vst1q_u16(*dst, row2);
+  *dst += 8;
+  vst1q_u16(*dst, row3);
+  *dst += stride - 24;  // net advance of one row
+}
+// 32x32 TM: dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, 2^bd - 1).
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+                                        const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);  // upper clamp bound
+  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+  const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
+  const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
+  const int16x8_t sub0 = vsubq_s16(above0, top_left);  // columns 0..7
+  const int16x8_t sub1 = vsubq_s16(above1, top_left);  // columns 8..15
+  const int16x8_t sub2 = vsubq_s16(above2, top_left);  // columns 16..23
+  const int16x8_t sub3 = vsubq_s16(above3, top_left);  // columns 24..31
+  int16x8_t left_dup;
+  int i, j;
+
+  for (i = 0; i < 4; i++, left += 8) {  // 8 rows per outer iteration
+    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+    }
+  }
+}
diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c
new file mode 100644
index 000000000..db9ffef6c
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -0,0 +1,686 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+//      0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
+//  0   0   2   5  10  17  25  38  47  62  83 101 121
+//  1   1   4   8  15  22  30  45  58  74  92 112 133
+//  2   3   7  12  18  28  36  52  64  82 102 118
+//  3   6  11  16  23  31  43  60  73  90 109 126
+//  4   9  14  19  29  37  50  65  78  98 116 134
+//  5  13  20  26  35  44  54  72  85 105 123
+//  6  21  27  33  42  53  63  80  94 113 132
+//  7  24  32  39  48  57  71  88 104 120
+//  8  34  40  46  56  68  81  96 111 130
+//  9  41  49  55  67  77  91 107 124
+// 10  51  59  66  76  89  99 119 131
+// 11  61  69  75  87 100 114 129
+// 12  70  79  86  97 108 122
+// 13  84  93 103 110 125
+// 14  95 106 115 127
+// 15 117 128
+static void idct32_12_neon(const int16_t *input, int16_t *output) {
+  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+  int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int16x8_t in8, in9, in10, in11;
+  int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27,
+      s1_28, s1_29, s1_31;
+  int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21,
+      s2_26, s2_27, s2_28, s2_29;
+  int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22,
+      s3_25, s3_26, s3_29, s3_30;
+  int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18,
+      s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28,
+      s4_29, s4_30, s4_31;
+  int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
+      s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
+      s5_29;
+  int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
+      s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
+      s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
+  int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
+      s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
+      s7_25, s7_26, s7_27;
+
+  load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
+                             &in6, &in7);  // 8x8 tile at input, stride 32
+
+  input += 8;  // step to columns 8..11 of the same 8 rows
+
+  tmp0 = vld1_s16(input);  // 4 values from each row
+  input += 32;  // next row
+  tmp1 = vld1_s16(input);
+  input += 32;
+  tmp2 = vld1_s16(input);
+  input += 32;
+  tmp3 = vld1_s16(input);
+  input += 32;
+  tmp4 = vld1_s16(input);
+  input += 32;
+  tmp5 = vld1_s16(input);
+  input += 32;
+  tmp6 = vld1_s16(input);
+  input += 32;
+  tmp7 = vld1_s16(input);
+
+  transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9,
+                    &in10, &in11);
+
+  // stage 1: odd-numbered inputs in1..in11, one multiply per term
+  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+  s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
+  s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+
+  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+  s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
+  s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+
+  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+  // stage 2: inputs 2, 10 and 6, then add/sub pairs of stage-1 terms
+  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+  s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
+  s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+
+  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+  s2_18 = vsubq_s16(s1_19, s1_18);
+  s2_19 = vaddq_s16(s1_18, s1_19);
+  s2_20 = vaddq_s16(s1_20, s1_21);
+  s2_21 = vsubq_s16(s1_20, s1_21);
+  s2_26 = vsubq_s16(s1_27, s1_26);
+  s2_27 = vaddq_s16(s1_26, s1_27);
+  s2_28 = vaddq_s16(s1_28, s1_29);
+  s2_29 = vsubq_s16(s1_28, s1_29);
+
+  // stage 3
+  s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+  s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+  s3_10 = vsubq_s16(s2_11, s2_10);
+  s3_11 = vaddq_s16(s2_10, s2_11);
+  s3_12 = vaddq_s16(s2_12, s2_13);
+  s3_13 = vsubq_s16(s2_12, s2_13);
+
+  s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+                                                   cospi_28_64);
+  s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+                                                   cospi_4_64);
+
+  s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
+                                                   -cospi_4_64);
+  s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
+                                                   cospi_28_64);
+
+  s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
+                                                   cospi_12_64);
+  s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
+                                                   cospi_20_64);
+
+  s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+                                                   -cospi_20_64);
+  s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+                                                   cospi_12_64);
+
+  // stage 4
+  s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+  s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
+  s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+
+  s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+                                                  cospi_24_64);
+  s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+                                                   cospi_8_64);
+
+  s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
+                                                   -cospi_8_64);
+  s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+                                                   cospi_24_64);
+
+  s4_16 = vaddq_s16(s1_16, s2_19);
+  s4_17 = vaddq_s16(s3_17, s3_18);
+  s4_18 = vsubq_s16(s3_17, s3_18);
+  s4_19 = vsubq_s16(s1_16, s2_19);
+  s4_20 = vsubq_s16(s1_23, s2_20);
+  s4_21 = vsubq_s16(s3_22, s3_21);
+  s4_22 = vaddq_s16(s3_21, s3_22);
+  s4_23 = vaddq_s16(s2_20, s1_23);
+  s4_24 = vaddq_s16(s1_24, s2_27);
+  s4_25 = vaddq_s16(s3_25, s3_26);
+  s4_26 = vsubq_s16(s3_25, s3_26);
+  s4_27 = vsubq_s16(s1_24, s2_27);
+  s4_28 = vsubq_s16(s1_31, s2_28);
+  s4_29 = vsubq_s16(s3_30, s3_29);
+  s4_30 = vaddq_s16(s3_29, s3_30);
+  s4_31 = vaddq_s16(s2_28, s1_31);
+
+  // stage 5
+  s5_0 = vaddq_s16(s4_0, s4_3);
+  s5_1 = vaddq_s16(s4_0, s4_2);
+  s5_2 = vsubq_s16(s4_0, s4_2);
+  s5_3 = vsubq_s16(s4_0, s4_3);
+
+  s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64);
+  s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64);
+
+  s5_8 = vaddq_s16(s2_8, s3_11);
+  s5_9 = vaddq_s16(s4_9, s4_10);
+  s5_10 = vsubq_s16(s4_9, s4_10);
+  s5_11 = vsubq_s16(s2_8, s3_11);
+  s5_12 = vsubq_s16(s2_15, s3_12);
+  s5_13 = vsubq_s16(s4_14, s4_13);
+  s5_14 = vaddq_s16(s4_13, s4_14);
+  s5_15 = vaddq_s16(s2_15, s3_12);
+
+  s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
+                                                   cospi_24_64);
+  s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
+                                                   cospi_8_64);
+
+  s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
+                                                   cospi_24_64);
+  s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
+                                                   cospi_8_64);
+
+  s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
+                                                   -cospi_8_64);
+  s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
+                                                   cospi_24_64);
+
+  s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
+                                                   -cospi_8_64);
+  s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
+                                                   cospi_24_64);
+
+  // stage 6
+  s6_0 = vaddq_s16(s5_0, s3_7);
+  s6_1 = vaddq_s16(s5_1, s5_6);
+  s6_2 = vaddq_s16(s5_2, s5_5);
+  s6_3 = vaddq_s16(s5_3, s3_4);
+  s6_4 = vsubq_s16(s5_3, s3_4);
+  s6_5 = vsubq_s16(s5_2, s5_5);
+  s6_6 = vsubq_s16(s5_1, s5_6);
+  s6_7 = vsubq_s16(s5_0, s3_7);
+
+  s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
+  s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+
+  s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
+  s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+
+  s6_16 = vaddq_s16(s4_16, s4_23);
+  s6_17 = vaddq_s16(s4_17, s4_22);
+  s6_18 = vaddq_s16(s5_18, s5_21);
+  s6_19 = vaddq_s16(s5_19, s5_20);
+  s6_20 = vsubq_s16(s5_19, s5_20);
+  s6_21 = vsubq_s16(s5_18, s5_21);
+  s6_22 = vsubq_s16(s4_17, s4_22);
+  s6_23 = vsubq_s16(s4_16, s4_23);
+
+  s6_24 = vsubq_s16(s4_31, s4_24);
+  s6_25 = vsubq_s16(s4_30, s4_25);
+  s6_26 = vsubq_s16(s5_29, s5_26);
+  s6_27 = vsubq_s16(s5_28, s5_27);
+  s6_28 = vaddq_s16(s5_27, s5_28);
+  s6_29 = vaddq_s16(s5_26, s5_29);
+  s6_30 = vaddq_s16(s4_25, s4_30);
+  s6_31 = vaddq_s16(s4_24, s4_31);
+
+  // stage 7
+  s7_0 = vaddq_s16(s6_0, s5_15);
+  s7_1 = vaddq_s16(s6_1, s5_14);
+  s7_2 = vaddq_s16(s6_2, s6_13);
+  s7_3 = vaddq_s16(s6_3, s6_12);
+  s7_4 = vaddq_s16(s6_4, s6_11);
+  s7_5 = vaddq_s16(s6_5, s6_10);
+  s7_6 = vaddq_s16(s6_6, s5_9);
+  s7_7 = vaddq_s16(s6_7, s5_8);
+  s7_8 = vsubq_s16(s6_7, s5_8);
+  s7_9 = vsubq_s16(s6_6, s5_9);
+  s7_10 = vsubq_s16(s6_5, s6_10);
+  s7_11 = vsubq_s16(s6_4, s6_11);
+  s7_12 = vsubq_s16(s6_3, s6_12);
+  s7_13 = vsubq_s16(s6_2, s6_13);
+  s7_14 = vsubq_s16(s6_1, s5_14);
+  s7_15 = vsubq_s16(s6_0, s5_15);
+
+  s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
+  s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+
+  s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
+  s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+
+  s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
+  s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+
+  s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
+  s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+
+  // final stage: 32 stores of 8 values each, spaced 16 int16 apart in output
+  vst1q_s16(output, vaddq_s16(s7_0, s6_31));  // store 0
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_1, s6_30));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_2, s6_29));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_3, s6_28));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_4, s7_27));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_5, s7_26));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_6, s7_25));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_7, s7_24));
+  output += 16;
+
+  vst1q_s16(output, vaddq_s16(s7_8, s7_23));  // store 8
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_9, s7_22));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_10, s7_21));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_11, s7_20));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_12, s6_19));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_13, s6_18));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_14, s6_17));
+  output += 16;
+  vst1q_s16(output, vaddq_s16(s7_15, s6_16));
+  output += 16;
+
+  vst1q_s16(output, vsubq_s16(s7_15, s6_16));  // store 16: same pairs, subtracted
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_14, s6_17));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_13, s6_18));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_12, s6_19));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_11, s7_20));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_10, s7_21));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_9, s7_22));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_8, s7_23));
+  output += 16;
+
+  vst1q_s16(output, vsubq_s16(s7_7, s7_24));  // store 24
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_6, s7_25));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_5, s7_26));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_4, s7_27));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_3, s6_28));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_2, s6_29));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_1, s6_30));
+  output += 16;
+  vst1q_s16(output, vsubq_s16(s7_0, s6_31));  // store 31 (last)
+}
+
+static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) {
+  int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+      in13, in14, in15;
+  int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24,
+      s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31;
+  int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17,
+      s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27,
+      s2_28, s2_29, s2_30, s2_31;
+  int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13,
+      s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30;
+  int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14,
+      s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25,
+      s4_26, s4_27, s4_28, s4_29, s4_30, s4_31;
+  int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
+      s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
+      s5_29;
+  int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
+      s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
+      s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
+  int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
+      s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
+      s7_25, s7_26, s7_27;
+  int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+  load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5,
+                             &in6, &in7);  // 8x8 tile at input, stride 16
+
+  load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12,
+                             &in13, &in14, &in15);  // tile 8 values further in
+
+  // stage 1: odd-numbered inputs in1..in15, one multiply per term
+  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+  s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64);
+  s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64);
+
+  s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
+  s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+
+  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+  s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
+  s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+
+  s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64);
+  s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64);
+
+  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+  // stage 2: inputs 2, 14, 10 and 6, then add/sub pairs of stage-1 terms
+  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+  s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64);
+  s2_14 = multiply_shift_and_narrow_s16(in14, cospi_14_64);
+
+  s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
+  s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+
+  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+  s2_16 = vaddq_s16(s1_16, s1_17);
+  s2_17 = vsubq_s16(s1_16, s1_17);
+  s2_18 = vsubq_s16(s1_19, s1_18);
+  s2_19 = vaddq_s16(s1_18, s1_19);
+  s2_20 = vaddq_s16(s1_20, s1_21);
+  s2_21 = vsubq_s16(s1_20, s1_21);
+  s2_22 = vsubq_s16(s1_23, s1_22);
+  s2_23 = vaddq_s16(s1_22, s1_23);
+  s2_24 = vaddq_s16(s1_24, s1_25);
+  s2_25 = vsubq_s16(s1_24, s1_25);
+  s2_26 = vsubq_s16(s1_27, s1_26);
+  s2_27 = vaddq_s16(s1_26, s1_27);
+  s2_28 = vaddq_s16(s1_28, s1_29);
+  s2_29 = vsubq_s16(s1_28, s1_29);
+  s2_30 = vsubq_s16(s1_31, s1_30);
+  s2_31 = vaddq_s16(s1_30, s1_31);
+
+  // stage 3
+  s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+  s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+  s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64);
+  s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64);
+
+  s3_8 = vaddq_s16(s2_8, s2_9);
+  s3_9 = vsubq_s16(s2_8, s2_9);
+  s3_10 = vsubq_s16(s2_11, s2_10);
+  s3_11 = vaddq_s16(s2_10, s2_11);
+  s3_12 = vaddq_s16(s2_12, s2_13);
+  s3_13 = vsubq_s16(s2_12, s2_13);
+  s3_14 = vsubq_s16(s2_15, s2_14);
+  s3_15 = vaddq_s16(s2_14, s2_15);
+
+  s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30,
+                                                   cospi_28_64);
+  s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30,
+                                                   cospi_4_64);
+
+  s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
+                                                   -cospi_4_64);
+  s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
+                                                   cospi_28_64);
+
+  s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
+                                                   cospi_12_64);
+  s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
+                                                   cospi_20_64);
+
+  s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25,
+                                                   -cospi_20_64);
+  s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25,
+                                                   cospi_12_64);
+
+  // stage 4
+  s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+  s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
+  s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+
+  s4_4 = vaddq_s16(s3_4, s3_5);
+  s4_5 = vsubq_s16(s3_4, s3_5);
+  s4_6 = vsubq_s16(s3_7, s3_6);
+  s4_7 = vaddq_s16(s3_6, s3_7);
+
+  s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14,
+                                                  cospi_24_64);
+  s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14,
+                                                   cospi_8_64);
+
+  s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
+                                                   -cospi_8_64);
+  s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+                                                   cospi_24_64);
+
+  s4_16 = vaddq_s16(s2_16, s2_19);
+  s4_17 = vaddq_s16(s3_17, s3_18);
+  s4_18 = vsubq_s16(s3_17, s3_18);
+  s4_19 = vsubq_s16(s2_16, s2_19);
+  s4_20 = vsubq_s16(s2_23, s2_20);
+  s4_21 = vsubq_s16(s3_22, s3_21);
+  s4_22 = vaddq_s16(s3_21, s3_22);
+  s4_23 = vaddq_s16(s2_20, s2_23);
+  s4_24 = vaddq_s16(s2_24, s2_27);
+  s4_25 = vaddq_s16(s3_25, s3_26);
+  s4_26 = vsubq_s16(s3_25, s3_26);
+  s4_27 = vsubq_s16(s2_24, s2_27);
+  s4_28 = vsubq_s16(s2_31, s2_28);
+  s4_29 = vsubq_s16(s3_30, s3_29);
+  s4_30 = vaddq_s16(s3_29, s3_30);
+  s4_31 = vaddq_s16(s2_28, s2_31);
+
+  // stage 5
+  s5_0 = vaddq_s16(s4_0, s4_3);
+  s5_1 = vaddq_s16(s4_0, s4_2);
+  s5_2 = vsubq_s16(s4_0, s4_2);
+  s5_3 = vsubq_s16(s4_0, s4_3);
+
+  s5_5 = sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64);
+  s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64);
+
+  s5_8 = vaddq_s16(s3_8, s3_11);
+  s5_9 = vaddq_s16(s4_9, s4_10);
+  s5_10 = vsubq_s16(s4_9, s4_10);
+  s5_11 = vsubq_s16(s3_8, s3_11);
+  s5_12 = vsubq_s16(s3_15, s3_12);
+  s5_13 = vsubq_s16(s4_14, s4_13);
+  s5_14 = vaddq_s16(s4_13, s4_14);
+  s5_15 = vaddq_s16(s3_15, s3_12);
+
+  s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
+                                                   cospi_24_64);
+  s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
+                                                   cospi_8_64);
+
+  s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
+                                                   cospi_24_64);
+  s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
+                                                   cospi_8_64);
+
+  s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
+                                                   -cospi_8_64);
+  s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
+                                                   cospi_24_64);
+
+  s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
+                                                   -cospi_8_64);
+  s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
+                                                   cospi_24_64);
+
+  // stage 6
+  s6_0 = vaddq_s16(s5_0, s4_7);
+  s6_1 = vaddq_s16(s5_1, s5_6);
+  s6_2 = vaddq_s16(s5_2, s5_5);
+  s6_3 = vaddq_s16(s5_3, s4_4);
+  s6_4 = vsubq_s16(s5_3, s4_4);
+  s6_5 = vsubq_s16(s5_2, s5_5);
+  s6_6 = vsubq_s16(s5_1, s5_6);
+  s6_7 = vsubq_s16(s5_0, s4_7);
+
+  s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
+  s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+
+  s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
+  s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+
+  s6_16 = vaddq_s16(s4_16, s4_23);
+  s6_17 = vaddq_s16(s4_17, s4_22);
+  s6_18 = vaddq_s16(s5_18, s5_21);
+  s6_19 = vaddq_s16(s5_19, s5_20);
+  s6_20 = vsubq_s16(s5_19, s5_20);
+  s6_21 = vsubq_s16(s5_18, s5_21);
+  s6_22 = vsubq_s16(s4_17, s4_22);
+  s6_23 = vsubq_s16(s4_16, s4_23);
+  s6_24 = vsubq_s16(s4_31, s4_24);
+  s6_25 = vsubq_s16(s4_30, s4_25);
+  s6_26 = vsubq_s16(s5_29, s5_26);
+  s6_27 = vsubq_s16(s5_28, s5_27);
+  s6_28 = vaddq_s16(s5_27, s5_28);
+  s6_29 = vaddq_s16(s5_26, s5_29);
+  s6_30 = vaddq_s16(s4_25, s4_30);
+  s6_31 = vaddq_s16(s4_24, s4_31);
+
+  // stage 7
+  s7_0 = vaddq_s16(s6_0, s5_15);
+  s7_1 = vaddq_s16(s6_1, s5_14);
+  s7_2 = vaddq_s16(s6_2, s6_13);
+  s7_3 = vaddq_s16(s6_3, s6_12);
+  s7_4 = vaddq_s16(s6_4, s6_11);
+  s7_5 = vaddq_s16(s6_5, s6_10);
+  s7_6 = vaddq_s16(s6_6, s5_9);
+  s7_7 = vaddq_s16(s6_7, s5_8);
+  s7_8 = vsubq_s16(s6_7, s5_8);
+  s7_9 = vsubq_s16(s6_6, s5_9);
+  s7_10 = vsubq_s16(s6_5, s6_10);
+  s7_11 = vsubq_s16(s6_4, s6_11);
+  s7_12 = vsubq_s16(s6_3, s6_12);
+  s7_13 = vsubq_s16(s6_2, s6_13);
+  s7_14 = vsubq_s16(s6_1, s5_14);
+  s7_15 = vsubq_s16(s6_0, s5_15);
+
+  s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
+  s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+
+  s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
+  s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+
+  s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
+  s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+
+  s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
+  s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+
+  // final stage: four groups of 8 vectors, each handed to add_and_store_u8_s16
+  out0 = vaddq_s16(s7_0, s6_31);
+  out1 = vaddq_s16(s7_1, s6_30);
+  out2 = vaddq_s16(s7_2, s6_29);
+  out3 = vaddq_s16(s7_3, s6_28);
+  out4 = vaddq_s16(s7_4, s7_27);
+  out5 = vaddq_s16(s7_5, s7_26);
+  out6 = vaddq_s16(s7_6, s7_25);
+  out7 = vaddq_s16(s7_7, s7_24);
+
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
+                       stride);  // first group starts at output itself
+
+  out0 = vaddq_s16(s7_8, s7_23);
+  out1 = vaddq_s16(s7_9, s7_22);
+  out2 = vaddq_s16(s7_10, s7_21);
+  out3 = vaddq_s16(s7_11, s7_20);
+  out4 = vaddq_s16(s7_12, s6_19);
+  out5 = vaddq_s16(s7_13, s6_18);
+  out6 = vaddq_s16(s7_14, s6_17);
+  out7 = vaddq_s16(s7_15, s6_16);
+
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+                       output + (8 * stride), stride);
+
+  out0 = vsubq_s16(s7_15, s6_16);  // same pairs as above, now subtracted
+  out1 = vsubq_s16(s7_14, s6_17);
+  out2 = vsubq_s16(s7_13, s6_18);
+  out3 = vsubq_s16(s7_12, s6_19);
+  out4 = vsubq_s16(s7_11, s7_20);
+  out5 = vsubq_s16(s7_10, s7_21);
+  out6 = vsubq_s16(s7_9, s7_22);
+  out7 = vsubq_s16(s7_8, s7_23);
+
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+                       output + (16 * stride), stride);
+
+  out0 = vsubq_s16(s7_7, s7_24);
+  out1 = vsubq_s16(s7_6, s7_25);
+  out2 = vsubq_s16(s7_5, s7_26);
+  out3 = vsubq_s16(s7_4, s7_27);
+  out4 = vsubq_s16(s7_3, s6_28);
+  out5 = vsubq_s16(s7_2, s6_29);
+  out6 = vsubq_s16(s7_1, s6_30);
+  out7 = vsubq_s16(s7_0, s6_31);
+
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+                       output + (24 * stride), stride);
+}
+
+void vpx_idct32x32_135_add_neon(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  int i;
+  int16_t temp[32 * 16];  // first-pass output: 32 rows of 16 values
+  int16_t *t = temp;
+
+  idct32_12_neon(input, temp);  // input rows 0-7 -> temp columns 0-7
+  idct32_12_neon(input + 32 * 8, temp + 8);  // rows 8-15 -> columns 8-15
+
+  for (i = 0; i < 32; i += 8) {  // second pass: 8 rows of temp per call
+    idct32_16_neon(t, dest, stride);
+    t += (16 * 8);  // skip the 8 rows of 16 values just consumed
+    dest += 8;  // move 8 bytes along the destination row
+  }
+}
diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c
index ebec9df54..a584b1d9e 100644
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -34,7 +34,7 @@
 // 5 13 20 26
 // 6 21 27 33
 // 7 24 32
-static void idct32_6_neon(const int16_t *input, int16_t *output) {
+static void idct32_6_neon(const tran_low_t *input, int16_t *output) {
   int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
   int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
       s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
@@ -46,8 +46,22 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
       s2_31;
   int16x8_t s3_24, s3_25, s3_26, s3_27;
 
-  load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
-                             &in6, &in7);
+  in0 = load_tran_low_to_s16(input);
+  input += 32;
+  in1 = load_tran_low_to_s16(input);
+  input += 32;
+  in2 = load_tran_low_to_s16(input);
+  input += 32;
+  in3 = load_tran_low_to_s16(input);
+  input += 32;
+  in4 = load_tran_low_to_s16(input);
+  input += 32;
+  in5 = load_tran_low_to_s16(input);
+  input += 32;
+  in6 = load_tran_low_to_s16(input);
+  input += 32;
+  in7 = load_tran_low_to_s16(input);
+  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
 
   // stage 1
   // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
@@ -503,7 +517,7 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
                        output + (24 * stride), stride);
 }
 
-void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
                                int stride) {
   int i;
   int16_t temp[32 * 8];
diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm
index a223c0b63..f39e8ddd4 100644
--- a/vpx_dsp/arm/idct_neon.asm
+++ b/vpx_dsp/arm/idct_neon.asm
@@ -27,3 +27,4 @@
     vld1.s16        {$dst0-$dst1,$dst2-$dst3}, [$src]!
     ENDIF
     MEND
+    END
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 0a8607849..fb1fa6b68 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -776,133 +776,98 @@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
   vst1_u8(dst, d);
 }
 
+static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
+                                const uint8x8_t left) {  // 8 rows x 16 bytes
+  const uint8x16_t row_0 = vdupq_lane_u8(left, 0);  // lane 0 in all 16 bytes
+  const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+  const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+  const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+  const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+  const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+  const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+  const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+  vst1q_u8(*dst, row_0);  // 16-byte store at the caller's current pointer
+  *dst += stride;  // advance the caller's pointer by stride
+  vst1q_u8(*dst, row_1);
+  *dst += stride;
+  vst1q_u8(*dst, row_2);
+  *dst += stride;
+  vst1q_u8(*dst, row_3);
+  *dst += stride;
+  vst1q_u8(*dst, row_4);
+  *dst += stride;
+  vst1q_u8(*dst, row_5);
+  *dst += stride;
+  vst1q_u8(*dst, row_6);
+  *dst += stride;
+  vst1q_u8(*dst, row_7);
+  *dst += stride;  // *dst is now 8 * stride past its entry value
+}
+
 void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const uint8x16_t left_u8q = vld1q_u8(left);
-  uint8x8_t left_u8d = vget_low_u8(left_u8q);
-  uint8x16_t d;
-  int i;
   (void)above;
 
-  for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) {
-    d = vdupq_lane_u8(left_u8d, 0);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 1);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 2);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 3);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 4);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 5);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 6);
-    vst1q_u8(dst, d);
-    dst += stride;
-    d = vdupq_lane_u8(left_u8d, 7);
-    vst1q_u8(dst, d);
-    dst += stride;
-  }
+  h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
+  h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
+}
+
+static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
+                                const uint8x8_t left) {  // 8 rows x 32 bytes
+  const uint8x16_t row_0 = vdupq_lane_u8(left, 0);  // lane 0 in all 16 bytes
+  const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+  const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+  const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+  const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+  const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+  const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+  const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+  vst1q_u8(*dst, row_0);  // Note clang-3.8 produced poor code w/vst2q_u8
+  *dst += 16;  // move to the second 16-byte half of the same row
+  vst1q_u8(*dst, row_0);
+  *dst += stride - 16;  // undo the +16, then advance by stride
+  vst1q_u8(*dst, row_1);
+  *dst += 16;
+  vst1q_u8(*dst, row_1);
+  *dst += stride - 16;
+  vst1q_u8(*dst, row_2);
+  *dst += 16;
+  vst1q_u8(*dst, row_2);
+  *dst += stride - 16;
+  vst1q_u8(*dst, row_3);
+  *dst += 16;
+  vst1q_u8(*dst, row_3);
+  *dst += stride - 16;
+  vst1q_u8(*dst, row_4);
+  *dst += 16;
+  vst1q_u8(*dst, row_4);
+  *dst += stride - 16;
+  vst1q_u8(*dst, row_5);
+  *dst += 16;
+  vst1q_u8(*dst, row_5);
+  *dst += stride - 16;
+  vst1q_u8(*dst, row_6);
+  *dst += 16;
+  vst1q_u8(*dst, row_6);
+  *dst += stride - 16;
+  vst1q_u8(*dst, row_7);
+  *dst += 16;
+  vst1q_u8(*dst, row_7);
+  *dst += stride - 16;  // net effect: *dst advanced by 8 * stride
 }
 
 void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
-  uint8x16_t d;
   int i;
   (void)above;
 
   for (i = 0; i < 2; i++, left += 16) {
     const uint8x16_t left_u8 = vld1q_u8(left);
-    const uint8x8_t left_low = vget_low_u8(left_u8);
-    const uint8x8_t left_high = vget_high_u8(left_u8);
-    d = vdupq_lane_u8(left_low, 0);
-    vst1q_u8(dst, d);  // Note clang-3.8 produced poor code w/vst2q_u8
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 1);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 2);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 3);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 4);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 5);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 6);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_low, 7);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-
-    d = vdupq_lane_u8(left_high, 0);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 1);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 2);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 3);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 4);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 5);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 6);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
-    d = vdupq_lane_u8(left_high, 7);
-    vst1q_u8(dst, d);
-    dst += 16;
-    vst1q_u8(dst, d);
-    dst += stride - 16;
+    h_store_32x8(&dst, stride, vget_low_u8(left_u8));
+    h_store_32x8(&dst, stride, vget_high_u8(left_u8));
   }
 }
 
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 445add296..4fa8ff115 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -141,6 +141,71 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
   *a1 = d0.val[1];
 }
 
+static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
+                                     const int16x4_t a2, const int16x4_t a3,
+                                     const int16x4_t a4, const int16x4_t a5,
+                                     const int16x4_t a6, const int16x4_t a7,
+                                     int16x8_t *o0, int16x8_t *o1,
+                                     int16x8_t *o2, int16x8_t *o3) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // a4: 40 41 42 43
+  // a5: 50 51 52 53
+  // a6: 60 61 62 63
+  // a7: 70 71 72 73
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+  // b2.val[0]: 40 50 42 52
+  // b2.val[1]: 41 51 43 53
+  // b3.val[0]: 60 70 62 72
+  // b3.val[1]: 61 71 63 73
+
+  const int16x4x2_t b0 = vtrn_s16(a0, a1);
+  const int16x4x2_t b1 = vtrn_s16(a2, a3);
+  const int16x4x2_t b2 = vtrn_s16(a4, a5);
+  const int16x4x2_t b3 = vtrn_s16(a6, a7);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+  // c2.val[0]: 40 50 60 70
+  // c2.val[1]: 42 52 62 72
+  // c3.val[0]: 41 51 61 71
+  // c3.val[1]: 43 53 63 73
+
+  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+                                  vreinterpret_s32_s16(b1.val[0]));
+  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+                                  vreinterpret_s32_s16(b1.val[1]));
+  const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
+                                  vreinterpret_s32_s16(b3.val[0]));
+  const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
+                                  vreinterpret_s32_s16(b3.val[1]));
+
+  // Combine 64 bit halves (low: rows 0-3, high: rows 4-7) resulting in:
+  // o0: 00 10 20 30 40 50 60 70
+  // o1: 01 11 21 31 41 51 61 71
+  // o2: 02 12 22 32 42 52 62 72
+  // o3: 03 13 23 33 43 53 63 73
+
+  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
+                     vreinterpret_s16_s32(c2.val[0]));
+  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
+                     vreinterpret_s16_s32(c3.val[0]));
+  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
+                     vreinterpret_s16_s32(c2.val[1]));
+  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
+                     vreinterpret_s16_s32(c3.val[1]));
+}
+
 static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                     uint8x8_t *a3) {
   // Swap 8 bit elements. Goes from:
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
index 589b124e2..b2d94795d 100644
--- a/vpx_dsp/deblock.c
+++ b/vpx_dsp/deblock.c
@@ -156,14 +156,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
                             int flimit) {
   int r, c, i;
-  const int16_t *rv3 = &vpx_rv[63 & rand()];
 
   for (c = 0; c < cols; c++) {
     unsigned char *s = &dst[c];
     int sumsq = 0;
     int sum = 0;
     unsigned char d[16];
-    const int16_t *rv2 = rv3 + ((c * 17) & 127);
 
     for (i = -8; i < 0; i++) s[i * pitch] = s[0];
 
@@ -183,7 +181,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
       d[r & 15] = s[0];
 
       if (sumsq * 15 - sum * sum < flimit) {
-        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+        d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4;
       }
       if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
       s += pitch;
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 46ddd1da0..f3f543ddf 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -96,6 +96,7 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
 void idct4_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step[4];
   tran_high_t temp1, temp2;
+
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -114,9 +115,9 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) {
 }
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
   tran_low_t out[4 * 4];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[4], temp_out[4];
 
   // Rows
@@ -142,6 +143,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
   int i;
   tran_high_t a1;
   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
@@ -157,6 +159,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
 void idct8_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
+
   // stage 1
   step1[0] = input[0];
   step1[2] = input[4];
@@ -209,9 +212,9 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) {
 }
 
 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
   tran_low_t out[8 * 8];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[8], temp_out[8];
 
   // First transform rows
@@ -236,6 +239,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
@@ -246,14 +250,13 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
 
 void iadst4_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
   tran_low_t x0 = input[0];
   tran_low_t x1 = input[1];
   tran_low_t x2 = input[2];
   tran_low_t x3 = input[3];
 
   if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
+    memset(output, 0, 4 * sizeof(*output));
     return;
   }
 
@@ -283,7 +286,6 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
 
 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
-
   tran_high_t x0 = input[7];
   tran_high_t x1 = input[0];
   tran_high_t x2 = input[5];
@@ -294,8 +296,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x7 = input[6];
 
   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = 0;
+    memset(output, 0, 8 * sizeof(*output));
     return;
   }
 
@@ -359,13 +360,13 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
 }
 
 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[8], temp_out[8];
 
   // First transform rows
-  // only first 4 row has non-zero coefs
+  // Only the first 4 rows have non-zero coefs
   for (i = 0; i < 4; ++i) {
     idct8_c(input, outptr);
     input += 8;
@@ -550,9 +551,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
 
 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
+  int i, j;
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[16], temp_out[16];
 
   // First transform rows
@@ -576,7 +577,6 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
 void iadst16_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
   tran_high_t x0 = input[15];
   tran_high_t x1 = input[0];
   tran_high_t x2 = input[13];
@@ -596,9 +596,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
 
   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
         x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = output[8] = output[9] = output[10] =
-            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
+    memset(output, 0, 16 * sizeof(*output));
     return;
   }
 
@@ -746,9 +744,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
 
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
+  int i, j;
   tran_low_t out[16 * 16] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[16], temp_out[16];
 
   // First transform rows. Since all non-zero dct coefficients are in
@@ -774,6 +772,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
@@ -1151,9 +1150,9 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
 
 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                               int stride) {
+  int i, j;
   tran_low_t out[32 * 32];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[32], temp_out[32];
 
   // Rows
@@ -1188,13 +1187,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
+  int i, j;
   tran_low_t out[32 * 32] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[32], temp_out[32];
 
   // Rows
-  // only upper-left 16x16 has non-zero coeff
+  // Only upper-left 16x16 has non-zero coeff
   for (i = 0; i < 16; ++i) {
     idct32_c(input, outptr);
     input += 32;
@@ -1214,13 +1213,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
+  int i, j;
   tran_low_t out[32 * 32] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[32], temp_out[32];
 
   // Rows
-  // only upper-left 8x8 has non-zero coeff
+  // Only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
     idct32_c(input, outptr);
     input += 32;
@@ -1241,8 +1240,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
-
   tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
   out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
@@ -1373,12 +1372,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 1
   temp1 = (input[0] + input[2]) * cospi_16_64;
   temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
   temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 2
   output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
@@ -1389,9 +1388,9 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
+  int i, j;
   tran_low_t out[4 * 4];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[4], temp_out[4];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
@@ -1418,10 +1417,10 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int i;
   tran_high_t a1;
   tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
@@ -1452,12 +1451,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[3] = input[6];
   temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
   temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
   temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 2 & stage 3 - even half
   vpx_highbd_idct4_c(step1, step1, bd);
@@ -1472,8 +1471,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   // stage 4
@@ -1489,20 +1488,20 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
+  int i, j;
   tran_low_t out[8 * 8];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[8], temp_out[8];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // First transform rows.
+  // First transform rows
   for (i = 0; i < 8; ++i) {
     vpx_highbd_idct8_c(input, outptr, bd);
     input += 8;
     outptr += 8;
   }
 
-  // Then transform columns.
+  // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
     vpx_highbd_idct8_c(temp_in, temp_out, bd);
@@ -1518,9 +1517,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int i, j;
   tran_high_t a1;
   tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -1567,10 +1567,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
   // + 1b (addition) = 29b.
   // Hence the output bit depth is 15b.
-  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
-  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
-  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
-  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
+  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
+  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
+  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
 }
 
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1608,14 +1608,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
 
-  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
-  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
+  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
+  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
 
   // stage 2
   s0 = x0;
@@ -1631,10 +1631,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -1642,10 +1642,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
 
   output[0] = HIGHBD_WRAPLOW(x0, bd);
   output[1] = HIGHBD_WRAPLOW(-x4, bd);
@@ -1657,22 +1657,23 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
   output[7] = HIGHBD_WRAPLOW(-x1, bd);
 }
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
+  int i, j;
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[8], temp_out[8];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // First transform rows.
-  // Only first 4 row has non-zero coefs.
+  // First transform rows
+  // Only the first 4 rows have non-zero coefs
   for (i = 0; i < 4; ++i) {
     vpx_highbd_idct8_c(input, outptr, bd);
     input += 8;
     outptr += 8;
   }
-  // Then transform columns.
+
+  // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
     vpx_highbd_idct8_c(temp_in, temp_out, bd);
@@ -1726,23 +1727,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 3
   step1[0] = step2[0];
@@ -1752,12 +1753,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
@@ -1771,12 +1772,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
@@ -1786,12 +1787,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
@@ -1803,8 +1804,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
@@ -1829,12 +1830,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
@@ -1859,20 +1860,20 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
+  int i, j;
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[16], temp_out[16];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // First transform rows.
+  // First transform rows
   for (i = 0; i < 16; ++i) {
     vpx_highbd_idct16_c(input, outptr, bd);
     input += 16;
     outptr += 16;
   }
 
-  // Then transform columns.
+  // Then transform columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
     vpx_highbd_idct16_c(temp_in, temp_out, bd);
@@ -1936,22 +1937,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
 
-  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
-  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
-  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
-  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
-  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
-  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
-  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
-  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
+  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
 
   // stage 2
   s0 = x0;
@@ -1979,14 +1980,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
   x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
   x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
-  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
-  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
-  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
-  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
-  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
-  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
+  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
 
   // stage 3
   s0 = x0;
@@ -2010,18 +2011,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
   x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
   x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
   x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
   x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
   x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
   x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
-  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
-  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
+  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
 
   // stage 4
   s2 = (-cospi_16_64) * (x2 + x3);
@@ -2033,14 +2034,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
   s14 = (-cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
-  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
-  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
+  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
+  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
+  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
+  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
 
   output[0] = HIGHBD_WRAPLOW(x0, bd);
   output[1] = HIGHBD_WRAPLOW(-x8, bd);
@@ -2062,9 +2063,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
 
 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
+  int i, j;
   tran_low_t out[16 * 16] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[16], temp_out[16];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
@@ -2076,7 +2077,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
     outptr += 16;
   }
 
-  // Then transform columns.
+  // Then transform columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
     vpx_highbd_idct16_c(temp_in, temp_out, bd);
@@ -2092,10 +2093,10 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int i, j;
   tran_high_t a1;
   tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -2137,43 +2138,43 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   // stage 2
   step2[0] = step1[0];
@@ -2187,23 +2188,23 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
@@ -2230,12 +2231,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
 
   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
@@ -2250,22 +2251,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[31] = step2[31];
   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[19] = step2[19];
   step1[20] = step2[20];
   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[23] = step2[23];
   step1[24] = step2[24];
   step1[27] = step2[27];
@@ -2274,12 +2275,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   // stage 4
   temp1 = (step1[0] + step1[1]) * cospi_16_64;
   temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
@@ -2289,12 +2290,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step2[15] = step1[15];
   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[11] = step1[11];
   step2[12] = step1[12];
 
@@ -2324,8 +2325,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[4] = step2[4];
   temp1 = (step2[6] - step2[5]) * cospi_16_64;
   temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[7] = step2[7];
 
   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
@@ -2341,20 +2342,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[17] = step2[17];
   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[22] = step2[22];
   step1[23] = step2[23];
   step1[24] = step2[24];
@@ -2375,12 +2376,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step2[9] = step1[9];
   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
   temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
   temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step2[14] = step1[14];
   step2[15] = step1[15];
 
@@ -2426,20 +2427,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
   step1[19] = step2[19];
   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
   temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
   temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
   temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
   temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
   step1[28] = step2[28];
   step1[29] = step2[29];
   step1[30] = step2[30];
@@ -2482,9 +2483,9 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
 
 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
+  int i, j;
   tran_low_t out[32 * 32];
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[32], temp_out[32];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
@@ -2520,19 +2521,20 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
 
 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
+  int i, j;
   tran_low_t out[32 * 32] = { 0 };
   tran_low_t *outptr = out;
-  int i, j;
   tran_low_t temp_in[32], temp_out[32];
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   // Rows
-  // Only upper-left 8x8 has non-zero coeff.
+  // Only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
     highbd_idct32_c(input, outptr, bd);
     input += 32;
     outptr += 32;
   }
+
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
@@ -2549,10 +2551,10 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
   int i, j;
   int a1;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
   tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+
+  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
 
   for (j = 0; j < 32; ++j) {
@@ -2560,4 +2562,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
     dest += stride;
   }
 }
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index e530730d5..13137659f 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
   (void)bd;
   return input;
 }
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return (tran_high_t)rv;
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_EMULATE_HARDWARE
diff --git a/vpx_dsp/mips/convolve8_avg_dspr2.c b/vpx_dsp/mips/convolve8_avg_dspr2.c
index 31812299c..b4ed6ee85 100644
--- a/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -403,8 +403,11 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             const int16_t *filter_y, int filter_y_stride, int w,
                             int h) {
   int x, y;
-  uint32_t tp1, tp2, tn1;
-  uint32_t tp3, tp4, tn2;
+  uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
 
   /* prefetch data to cache memory */
   prefetch_load(src);
diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c
index f6812c7d0..8d35b6394 100644
--- a/vpx_dsp/mips/convolve8_dspr2.c
+++ b/vpx_dsp/mips/convolve8_dspr2.c
@@ -1307,6 +1307,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   assert(y_step_q4 == 16);
   assert(((const int32_t *)filter_x)[1] != 0x800000);
   assert(((const int32_t *)filter_y)[1] != 0x800000);
+  (void)x_step_q4;
 
   /* bit positon for extract from acc */
   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
@@ -1398,6 +1399,10 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_y, int filter_y_stride,
                              int w, int h) {
   int x, y;
+  (void)filter_x;
+  (void)filter_x_stride;
+  (void)filter_y;
+  (void)filter_y_stride;
 
   /* prefetch data to cache memory */
   prefetch_load(src);
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index cc633c669..ba52e8095 100644
--- a/vpx_dsp/mips/deblock_msa.c
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -573,7 +573,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
 void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
                               int32_t cols, int32_t flimit) {
   int32_t row, col, cnt, i;
-  const int16_t *rv3 = &vpx_rv[63 & rand()];
   v4i32 flimit_vec;
   v16u8 dst7, dst8, dst_r_b, dst_l_b;
   v16i8 mask;
@@ -601,7 +600,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
 
     dst = LD_UB(dst_tmp);
     for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
-      rv2[i] = rv3 + ((cnt * 17) & 127);
+      rv2[i] = vpx_rv + (i & 7);
       ++i;
     }
     for (cnt = -8; cnt < 0; ++cnt) {
diff --git a/vpx_dsp/mips/intrapred16_dspr2.c b/vpx_dsp/mips/intrapred16_dspr2.c
index 3e29d0ac3..835e10e12 100644
--- a/vpx_dsp/mips/intrapred16_dspr2.c
+++ b/vpx_dsp/mips/intrapred16_dspr2.c
@@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
   int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+  (void)above;
 
   __asm__ __volatile__(
       "lb         %[tmp1],      (%[left])                    \n\t"
diff --git a/vpx_dsp/mips/intrapred4_dspr2.c b/vpx_dsp/mips/intrapred4_dspr2.c
index 9f51d50c7..dce03a2b2 100644
--- a/vpx_dsp/mips/intrapred4_dspr2.c
+++ b/vpx_dsp/mips/intrapred4_dspr2.c
@@ -14,6 +14,7 @@
 void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   int32_t tmp1, tmp2, tmp3, tmp4;
+  (void)above;
 
   __asm__ __volatile__(
       "lb         %[tmp1],      (%[left])                    \n\t"
diff --git a/vpx_dsp/mips/intrapred8_dspr2.c b/vpx_dsp/mips/intrapred8_dspr2.c
index eac79d510..16e7fc550 100644
--- a/vpx_dsp/mips/intrapred8_dspr2.c
+++ b/vpx_dsp/mips/intrapred8_dspr2.c
@@ -14,6 +14,7 @@
 void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
   int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  (void)above;
 
   __asm__ __volatile__(
       "lb         %[tmp1],      (%[left])                   \n\t"
diff --git a/vpx_dsp/mips/txfm_macros_msa.h b/vpx_dsp/mips/txfm_macros_msa.h
index da100f6a9..f077fa481 100644
--- a/vpx_dsp/mips/txfm_macros_msa.h
+++ b/vpx_dsp/mips/txfm_macros_msa.h
@@ -15,19 +15,24 @@
 
 #define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
   {                                                           \
-    v8i16 k0_m = __msa_fill_h(cnst0);                         \
-    v4i32 s0_m, s1_m, s2_m, s3_m;                             \
+    v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m;                 \
+    v8i16 k0_m, k1_m, k2_m, zero = { 0 };                     \
                                                               \
-    s0_m = (v4i32)__msa_fill_h(cnst1);                        \
-    k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m);                  \
+    k0_m = __msa_fill_h(cnst0);                               \
+    k1_m = __msa_fill_h(cnst1);                               \
+    k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m);                  \
+    k0_m = __msa_ilvev_h((v8i16)zero, k0_m);                  \
+    k1_m = __msa_ilvev_h(k1_m, (v8i16)zero);                  \
                                                               \
-    ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m);                   \
+    ILVRL_H2_SW(reg1, reg0, s5_m, s4_m);                      \
     ILVRL_H2_SW(reg0, reg1, s3_m, s2_m);                      \
-    DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m);          \
+    DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m);          \
+    s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m);          \
+    s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m);          \
     SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                  \
     out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
                                                               \
-    DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m);          \
+    DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m);          \
     SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS);                  \
     out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m);           \
   }
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 8c91b141f..bb1143cca 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -48,6 +48,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 
 ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -212,7 +213,7 @@ endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 DSP_SRCS-$(HAVE_NEON)  += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/idct32x32_add_neon.c
-DSP_SRCS-$(HAVE_NEON)  += arm/idct32x32_34_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/idct32x32_135_add_neon.c
 
 DSP_SRCS-$(HAVE_MSA)   += mips/inv_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)   += mips/idct4x4_msa.c
@@ -244,6 +245,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
 endif  # HAVE_NEON_ASM
 DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 
 endif  # CONFIG_VP9
 
@@ -252,6 +254,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h
 
+DSP_SRCS-$(HAVE_SSE2)   += x86/fdct.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 71015c439..bba6b4f78 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -214,6 +214,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -222,33 +223,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_4x4 sse2/;
+  specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
 
   add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -257,33 +264,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_8x8 sse2/;
+  specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_8x8 sse2/;;
+  specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
 
   add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -292,33 +305,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_16x16 sse2/;
+  specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_16x16 sse2/;
+  specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
 
   add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
@@ -327,27 +346,32 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_h_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
 
   add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_v_predictor_32x32 sse2/;
+  specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;
+  specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_dc_predictor_32x32 sse2/;
+  specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
 
   add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/;
 
   add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 
 #
@@ -640,7 +664,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
-    add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
     add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 
@@ -679,7 +703,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
+    specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1_add neon sse2/;
@@ -690,8 +714,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct8x8_64_add sse2/;
 
-    add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/vpx_highbd_idct8x8_10_add sse2/;
+    add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+    specialize qw/vpx_highbd_idct8x8_12_add sse2/;
 
     add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct16x16_256_add sse2/;
@@ -759,9 +783,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-    # Need to add 135 eob idct32x32 implementations.
     $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
-    $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
     $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
     $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
 
@@ -1728,11 +1750,9 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
 
     add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
     specialize qw/vpx_mbpost_proc_down sse2 msa/;
-    $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
 
     add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
     specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
-    $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
 
     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
     specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
index 6df360df4..ebca50930 100644
--- a/vpx_dsp/x86/deblock_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -230,11 +230,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
     ret
 %undef flimit
 
-;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
-;                            int pitch, int rows, int cols,int flimit)
+;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
+;                               int pitch, int rows, int cols,int flimit)
 extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_xmm) PRIVATE
-sym(vpx_mbpost_proc_down_xmm):
+global sym(vpx_mbpost_proc_down_sse2) PRIVATE
+sym(vpx_mbpost_proc_down_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
@@ -462,10 +462,10 @@ sym(vpx_mbpost_proc_down_xmm):
 %undef flimit4
 
 
-;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
-;                                int pitch, int rows, int cols,int flimit)
-global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vpx_mbpost_proc_across_ip_xmm):
+;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
+;                                    int pitch, int rows, int cols,int flimit)
+global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
+sym(vpx_mbpost_proc_across_ip_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
diff --git a/vpx_dsp/x86/fdct.h b/vpx_dsp/x86/fdct.h
new file mode 100644
index 000000000..54a6d81fc
--- /dev/null
+++ b/vpx_dsp/x86/fdct.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_FDCT_H_
+#define VPX_DSP_X86_FDCT_H_
+
+// Everything used below (__m128i, _mm_*_epi16, _mm_*_si128) is SSE2, which is
+// declared by <emmintrin.h>; <xmmintrin.h> is only the SSE header.
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then cast down.
+// This does not saturate values. It only truncates.
+// The 16 bit path is an aligned load, so |a| must be 16 byte aligned there.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
+                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
+                        (int16_t)a[6], (int16_t)a[7]);
+#else
+  return _mm_load_si128((const __m128i *)a);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+// Both paths use aligned stores, so |b| must be 16 byte aligned.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i one = _mm_set1_epi16(1);
+  // Signed 16x16 multiply by 1: the high half of each product is the sign
+  // extension of the lane, the low half is the lane itself.
+  const __m128i a_hi = _mm_mulhi_epi16(a, one);
+  const __m128i a_lo = _mm_mullo_epi16(a, one);
+  // Interleave low/high halves into 32 bit values: lanes 0-3, then lanes 4-7.
+  const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+  const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+  _mm_store_si128((__m128i *)(b), a_1);
+  _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+  _mm_store_si128((__m128i *)(b), a);
+#endif
+}
+
+// Zero fill 8 positions in the output buffer.
+// Both paths use aligned stores, so |a| must be 16 byte aligned.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+  const __m128i zero = _mm_setzero_si128();
+#if CONFIG_VP9_HIGHBITDEPTH
+  _mm_store_si128((__m128i *)(a), zero);
+  _mm_store_si128((__m128i *)(a + 4), zero);
+#else
+  _mm_store_si128((__m128i *)(a), zero);
+#endif
+}
+#endif  // VPX_DSP_X86_FDCT_H_
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 00d18f917..d5fc1440c 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -3673,7 +3673,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
   }
 }
 
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
@@ -4017,8 +4017,8 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
   tran_low_t out;
 
-  out = highbd_dct_const_round_shift(input[0] * cospi_16_64);
-  out = highbd_dct_const_round_shift(out * cospi_16_64);
+  out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
   a = ROUND_POWER_OF_TWO(out, 6);
 
   d = _mm_set1_epi32(a);
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index 2c7e431c7..0580a7bd7 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -13,32 +13,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
-
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-#else
-  return _mm_load_si128((const __m128i *)coeff_ptr);
-#endif
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
-                                      tran_low_t *coeff_ptr) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  __m128i one = _mm_set1_epi16(1);
-  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-#else
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-#endif
-}
+#include "vpx_dsp/x86/fdct.h"
 
 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,
@@ -81,8 +56,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
         __m128i qtmp0, qtmp1;
         __m128i cmp_mask0, cmp_mask1;
         // Do DC and first 15 AC
-        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
-        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
 
         // Poor man's sign extract
         coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -117,15 +92,15 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 
-        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         dequant = _mm_unpackhi_epi64(dequant, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
-        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }
 
       {
@@ -159,8 +134,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
         __m128i qtmp0, qtmp1;
         __m128i cmp_mask0, cmp_mask1;
 
-        coeff0 = load_coefficients(coeff_ptr + n_coeffs);
-        coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
 
         // Poor man's sign extract
         coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -191,14 +166,14 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 
-        store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
-        store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
-        store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }
 
       {
@@ -237,10 +212,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     }
   } else {
     do {
-      store_coefficients(zero, dqcoeff_ptr + n_coeffs);
-      store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
-      store_coefficients(zero, qcoeff_ptr + n_coeffs);
-      store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
+      store_tran_low(zero, dqcoeff_ptr + n_coeffs);
+      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
+      store_tran_low(zero, qcoeff_ptr + n_coeffs);
+      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
       n_coeffs += 8 * 2;
     } while (n_coeffs < 0);
     *eob_ptr = 0;
diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h
index 4aae30e94..c1f1b6027 100644
--- a/vpx_ports/vpx_timer.h
+++ b/vpx_ports/vpx_timer.h
@@ -21,6 +21,8 @@
 /*
  * Win32 specific includes
  */
+#undef NOMINMAX
+#define NOMINMAX
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #endif
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 6ba02cf1f..11c98fc76 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -140,6 +140,12 @@ static INLINE uint64_t xgetbv(void) {
 #endif
 
 #if defined(_MSC_VER) && _MSC_VER >= 1700
+/* Keep <windows.h> from defining min/max macros and pulling in extra APIs. */
+#undef NOMINMAX
+#define NOMINMAX
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
 #include <windows.h>
 #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
 #define getenv(x) NULL
diff --git a/vpxdec.c b/vpxdec.c
index f1b09e657..2cdb69d5a 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -92,31 +92,19 @@ static const arg_def_t md5arg =
 static const arg_def_t outbitdeptharg =
     ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
 #endif
-
-static const arg_def_t *all_args[] = { &codecarg,
-                                       &use_yv12,
-                                       &use_i420,
-                                       &flipuvarg,
-                                       &rawvideo,
-                                       &noblitarg,
-                                       &progressarg,
-                                       &limitarg,
-                                       &skiparg,
-                                       &postprocarg,
-                                       &summaryarg,
-                                       &outputfile,
-                                       &threadsarg,
-                                       &frameparallelarg,
-                                       &verbosearg,
-                                       &scalearg,
-                                       &fb_arg,
-                                       &md5arg,
-                                       &error_concealment,
-                                       &continuearg,
+static const arg_def_t svcdecodingarg = ARG_DEF(
+    NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer");
+
+static const arg_def_t *all_args[] = {
+  &codecarg,       &use_yv12,    &use_i420,   &flipuvarg,         &rawvideo,
+  &noblitarg,      &progressarg, &limitarg,   &skiparg,           &postprocarg,
+  &summaryarg,     &outputfile,  &threadsarg, &frameparallelarg,  &verbosearg,
+  &scalearg,       &fb_arg,      &md5arg,     &error_concealment, &continuearg,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                       &outbitdeptharg,
+  &outbitdeptharg,
 #endif
-                                       NULL };
+  &svcdecodingarg, NULL
+};
 
 #if CONFIG_VP8_DECODER
 static const arg_def_t addnoise_level =
@@ -519,6 +507,8 @@ static int main_loop(int argc, const char **argv_) {
 #if CONFIG_VP9_HIGHBITDEPTH
   unsigned int output_bit_depth = 0;
 #endif
+  int svc_decoding = 0;
+  int svc_spatial_layer = 0;
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 };
 #endif
@@ -610,6 +600,10 @@ static int main_loop(int argc, const char **argv_) {
       output_bit_depth = arg_parse_uint(&arg);
     }
 #endif
+    else if (arg_match(&arg, &svcdecodingarg, argi)) {
+      svc_decoding = 1;
+      svc_spatial_layer = arg_parse_uint(&arg);
+    }
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
@@ -726,7 +720,14 @@ static int main_loop(int argc, const char **argv_) {
             vpx_codec_error(&decoder));
     goto fail2;
   }
-
+  if (svc_decoding) {
+    if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER,
+                          svc_spatial_layer)) {
+      fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n",
+              vpx_codec_error(&decoder));
+      goto fail;
+    }
+  }
   if (!quiet) fprintf(stderr, "%s\n", decoder.name);
 
 #if CONFIG_VP8_DECODER