34 files changed, 898 insertions, 649 deletions
diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index 7272424af..029cc4a56 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl
@@ -138,14 +138,6 @@ while (<STDIN>)
     s/DCD(.*)/.long $1/;
     s/DCB(.*)/.byte $1/;

-    # RN to .req
-    if (s/RN\s+([Rr]\d+|lr)/.req $1/)
-    {
-        print;
-        print "$comment_sub$comment\n" if defined $comment;
-        next;
-    }
-
     # Make function visible to linker, and make additional symbol with
     # prepended underscore
     s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 1a9e105ba..e1ae7b4f8 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -120,18 +120,6 @@ while (<STDIN>)
     s/DCD(.*)/.long $1/;
     s/DCB(.*)/.byte $1/;

-    # Build a hash of all the register - alias pairs.
-    if (s/(.*)RN(.*)/$1 .req $2/g)
-    {
-        $register_aliases{trim($1)} = trim($2);
-        next;
-    }
-
-    while (($key, $value) = each(%register_aliases))
-    {
-        s/\b$key\b/$value/g;
-    }
-
     # Make function visible to linker, and make additional symbol with
     # prepended underscore
     s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f050fa06a..007e02000 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -936,6 +936,7 @@ EOF
           # only "AppContainerApplication" which requires an AppxManifest.
           # Therefore disable the examples, just build the library.
           disable_feature examples
+          disable_feature tools
         fi
         ;;
       rvct)
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index e3395afa2..2cf62c117 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -82,7 +82,7 @@ generate_filter() {
                 | sed -e "s,$src_path_bare,," \
                       -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
-            if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+            if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then
                 # Avoid object file name collisions, i.e. vpx_config.c and
                 # vpx_config.asm produce the same object file without
                 # this additional suffix.
@@ -452,7 +452,7 @@ generate_vcxproj() {
    done

    open_tag ItemGroup
-    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s"
+    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S"
    close_tag ItemGroup
    open_tag ItemGroup
    generate_filter "Header Files" "h;hm;inl;inc;xsd"
diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc
new file mode 100644
index 000000000..69f62f13b
--- /dev/null
+++ b/test/decode_svc_test.cc
@@ -0,0 +1,124 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+
+namespace {
+
+const unsigned int kNumFrames = 19;
+
+class DecodeSvcTest : public ::libvpx_test::DecoderTest,
+                      public ::libvpx_test::CodecTestWithParam<const char *> {
+ protected:
+  DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {}
+  virtual ~DecodeSvcTest() {}
+
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (video.frame_number() == 0)
+      decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_);
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_EQ(img.d_w, width_);
+    ASSERT_EQ(img.d_h, height_);
+    total_frames_ = frame_number;
+  }
+
+  int spatial_layer_;
+  unsigned int width_;
+  unsigned int height_;
+  unsigned int total_frames_;
+};
+
+// SVC test vector is 1280x720, with 3 spatial layers, and 20 frames.
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 0. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This results in 1/4x1/4 resolution (320x180).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 0;
+  width_ = 320;
+  height_ = 180;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 1. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This results in 1/2x1/2 resolution (640x360).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 1;
+  width_ = 640;
+  height_ = 360;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 2. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This results in the full resolution (1280x720).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 2;
+  width_ = 1280;
+  height_ = 720;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+// Decode the SVC test vector, which has 3 spatial layers, and decode up to
+// spatial layer 10. Verify the resolution of each decoded frame and the total
+// number of frames decoded. This is beyond the number of spatial layers, so
+// the decoding should result in the full resolution (1280x720).
+TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
+  const std::string filename = GET_PARAM(1);
+  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  video.reset(new libvpx_test::IVFVideoSource(filename));
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+  total_frames_ = 0;
+  spatial_layer_ = 10;
+  width_ = 1280;
+  height_ = 720;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  ASSERT_EQ(total_frames_, kNumFrames);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc,
+                                       libvpx_test::kVP9TestVectorsSvc +
+                                           libvpx_test::kNumVP9TestVectorsSvc));
+}  // namespace
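The decode_svc_test.cc cases above exercise the new VP9_DECODE_SVC_SPATIAL_LAYER control through the test harness. A minimal sketch of the same flow against the raw decoder API, assuming an already-demuxed VP9 superframe packet (error paths and frame retrieval elided; function name is illustrative):

#include <stddef.h>
#include <stdint.h>

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

/* Decode one packet of an SVC stream up to spatial layer |layer|
 * (0 = base layer). */
static vpx_codec_err_t decode_up_to_layer(const uint8_t *data, size_t size,
                                          int layer) {
  vpx_codec_ctx_t decoder;
  vpx_codec_err_t res =
      vpx_codec_dec_init(&decoder, vpx_codec_vp9_dx(), NULL, 0);
  if (res != VPX_CODEC_OK) return res;
  /* Must be set before the first vpx_codec_decode() call; frames from
   * higher layers within each superframe are then skipped. */
  res = vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, layer);
  if (res == VPX_CODEC_OK)
    res = vpx_codec_decode(&decoder, data, (unsigned int)size, NULL, 0);
  vpx_codec_destroy(&decoder);
  return res;
}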
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 3ac73c125..409a76bfc 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -105,20 +105,20 @@ void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {

 #if HAVE_SSE2
-void idct8x8_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_c(in, out, stride, 10);
+void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
 }

-void idct8x8_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_c(in, out, stride, 12);
+void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
 }

-void idct8x8_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 10);
+void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
 }

-void idct8x8_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
-  vpx_highbd_idct8x8_10_add_sse2(in, out, stride, 12);
+void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
+  vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
 }

 void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
@@ -728,10 +728,10 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, InvTrans8x8DCT,
     ::testing::Values(
-        make_tuple(&idct8x8_10_add_10_c, &idct8x8_10_add_10_sse2, 6225,
+        make_tuple(&idct8x8_12_add_10_c, &idct8x8_12_add_10_sse2, 6225,
                    VPX_BITS_10),
         make_tuple(&idct8x8_10, &idct8x8_64_add_10_sse2, 6225, VPX_BITS_10),
-        make_tuple(&idct8x8_10_add_12_c, &idct8x8_10_add_12_sse2, 6225,
+        make_tuple(&idct8x8_12_add_12_c, &idct8x8_12_add_12_sse2, 6225,
                    VPX_BITS_12),
         make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 12eaa80e7..2921e5ddf 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -188,6 +188,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
     "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" },
   { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
   { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
+  { 2, "invalid-crbug-629481.webm" },
 };

 INSTANTIATE_TEST_CASE_P(
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 0c704c5c8..3539591bb 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -32,6 +32,38 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int>
     PartialInvTxfmParam;
 const int kMaxNumCoeffs = 1024;
+
+// https://bugs.chromium.org/p/webm/issues/detail?id=1332
+// The functions specified do not pass with INT16_MIN/MAX. They fail at the
+// value specified, but pass when 1 is added/subtracted.
+int16_t MaxSupportedCoeff(InvTxfmFunc a) {
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \
+    !CONFIG_EMULATE_HARDWARE
+  if (a == vpx_idct8x8_64_add_ssse3 || a == vpx_idct8x8_12_add_ssse3) {
+    return 23625 - 1;
+  }
+#else
+  (void)a;
+#endif
+  return INT16_MAX;
+}
+
+int16_t MinSupportedCoeff(InvTxfmFunc a) {
+  (void)a;
+#if !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
+  if (a == vpx_idct8x8_64_add_ssse3 || a == vpx_idct8x8_12_add_ssse3) {
+    return -23625 + 1;
+  }
+#elif HAVE_NEON
+  if (a == vpx_idct4x4_16_add_neon) {
+    return INT16_MIN + 1;
+  }
+#endif
+#endif  // !CONFIG_EMULATE_HARDWARE
+  return INT16_MIN;
+}
+
 class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
  public:
   virtual ~PartialIDctTest() {}
@@ -142,14 +174,14 @@ TEST_P(PartialIDctTest, ResultsMatch) {
     memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
     int max_energy_leftover = max_coeff * max_coeff;
     for (int j = 0; j < last_nonzero_; ++j) {
-      int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
-                                          (rnd.Rand16() - 32768) / 65536);
-      max_energy_leftover -= coef * coef;
+      int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
+                                           (rnd.Rand16() - 32768) / 65536);
+      max_energy_leftover -= coeff * coeff;
       if (max_energy_leftover < 0) {
         max_energy_leftover = 0;
-        coef = 0;
+        coeff = 0;
       }
-      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
+      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coeff;
     }

     ASM_REGISTER_STATE_CHECK(
@@ -186,6 +218,33 @@ TEST_P(PartialIDctTest, AddOutputBlock) {
         << "Error: Transform results are not correctly added to output.";
   }
 }
+
+TEST_P(PartialIDctTest, SingleLargeCoeff) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int16_t max_coeff = MaxSupportedCoeff(partial_itxfm_);
+  const int16_t min_coeff = MinSupportedCoeff(partial_itxfm_);
+  for (int i = 0; i < last_nonzero_; ++i) {
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    // Run once for min and once for max.
+    for (int j = 0; j < 2; ++j) {
+      const int coeff = j ? min_coeff : max_coeff;
+
+      memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+      memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
+      input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff;
+
+      ASM_REGISTER_STATE_CHECK(
+          full_itxfm_(input_block_, output_block_ref_, size_));
+      ASM_REGISTER_STATE_CHECK(
+          partial_itxfm_(input_block_, output_block_, size_));
+
+      ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                          sizeof(*output_block_) * block_size_))
+          << "Error: Fails with single coeff of " << coeff << " at " << i
+          << ".";
+    }
+  }
+}

 using std::tr1::make_tuple;

 INSTANTIATE_TEST_CASE_P(
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 2e34fed06..4f6592647 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,22 +7,23 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"

-typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr,
-                             int src_pixels_per_line, int dst_pixels_per_line,
-                             int cols, unsigned char *flimit, int size);
+typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
+    unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
+    int dst_pixels_per_line, int cols, unsigned char *flimit, int size);

 namespace {

-class VPxPostProcessingFilterTest
-    : public ::testing::TestWithParam<PostProcFunc> {
+class VpxPostProcDownAndAcrossMbRowTest
+    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 };
@@ -30,7 +31,7 @@ class VPxPostProcessingFilterTest

 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.
-TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
   const int block_width = 16;
   const int block_height = 16;
@@ -78,14 +79,14 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
                            input_stride, output_stride, block_width, flimits,
                            16));

-  static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1,
-                                                       1, 1, 1, 1, 1, 1, 3, 4 };
+  static const uint8_t kExpectedOutput[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };

   pixel_ptr = dst_image_ptr;
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
-      EXPECT_EQ(expected_data[i], pixel_ptr[j])
-          << "VPxPostProcessingFilterTest failed with invalid filter output";
+      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]);
     }
     pixel_ptr += output_stride;
   }
@@ -96,18 +97,18 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
 };

 INSTANTIATE_TEST_CASE_P(
-    C, VPxPostProcessingFilterTest,
+    C, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));

 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VPxPostProcessingFilterTest,
+    SSE2, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
 #endif

 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
-    MSA, VPxPostProcessingFilterTest,
+    MSA, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
 #endif
diff --git a/test/test-data.mk b/test/test-data.mk
index 80b802e0a..ebf0cd8c2 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -775,6 +775,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.iv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res

 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # Encode / Decode test
@@ -867,3 +869,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_1-2.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index b97ae967e..b2af3ebb6 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -840,3 +840,7 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
 787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
 91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf
 1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res
+70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res
+7602e00378161ca36ae93cc6ee12dd30b5ba1e1d *vp90-2-22-svc_1280x720_3.ivf
+02e53e3eefbf25ec0929047fe50876acdeb040bd *vp90-2-22-svc_1280x720_3.ivf.md5
diff --git a/test/test.mk b/test/test.mk
index 60218a780..e25463e46 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -35,6 +35,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 460c1f51b..def78da28 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -373,7 +373,9 @@ const char *const kVP9TestVectors[] = {
   "vp90-2-20-big_superframe-02.webm",
   RESIZE_TEST_VECTORS
 };
+const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" };
 const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
+const int kNumVP9TestVectorsSvc = NELEMENTS(kVP9TestVectorsSvc);
 const char *const kVP9TestVectorsResize[] = { RESIZE_TEST_VECTORS };
 const int kNumVP9TestVectorsResize = NELEMENTS(kVP9TestVectorsResize);
 #undef RESIZE_TEST_VECTORS
diff --git a/test/test_vectors.h b/test/test_vectors.h
index 2c6918abd..3df3e8113 100644
--- a/test/test_vectors.h
+++ b/test/test_vectors.h
@@ -23,6 +23,8 @@ extern const char *const kVP8TestVectors[];
 #if CONFIG_VP9_DECODER
 extern const int kNumVP9TestVectors;
 extern const char *const kVP9TestVectors[];
+extern const int kNumVP9TestVectorsSvc;
+extern const char *const kVP9TestVectorsSvc[];
 extern const int kNumVP9TestVectorsResize;
 extern const char *const kVP9TestVectorsResize[];
 #endif  // CONFIG_VP9_DECODER
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index c6a39f85c..e3a088e28 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -331,8 +331,8 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
   // DC only DCT coefficient
   if (eob == 1) {
     vpx_highbd_idct8x8_1_add(input, dest, stride, bd);
-  } else if (eob <= 10) {
-    vpx_highbd_idct8x8_10_add(input, dest, stride, bd);
+  } else if (eob <= 12) {
+    vpx_highbd_idct8x8_12_add(input, dest, stride, bd);
   } else {
     vpx_highbd_idct8x8_64_add(input, dest, stride, bd);
   }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index fafc65983..abef06763 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -137,6 +137,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

   add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_fdct8x8_quant ssse3/;
 } else {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error avx2 msa sse2/;
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index fde0b7e31..628d1c8d2 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1517,7 +1517,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
     return 0;
   }

-  tile_data->xd.error_info = &tile_data->error_info;
   tile_data->xd.corrupted = 0;

   do {
@@ -1529,6 +1528,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
                        &tile_data->error_info, &tile_data->bit_reader,
                        pbi->decrypt_cb, pbi->decrypt_state);
     vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff);
+    // init resets xd.error_info
+    tile_data->xd.error_info = &tile_data->error_info;

     for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
          mi_row += MI_BLOCK_SIZE) {
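For context on the vp9_decodeframe.c fix above: vp9_init_macroblockd() re-initializes xd and clobbers the error_info pointer that used to be set before it, so a corrupt tile would longjmp through the wrong handler. A rough sketch of the per-tile pattern the fix restores (types and names are illustrative, not the decoder's actual ones):

#include <setjmp.h>

typedef struct {
  jmp_buf jmp; /* plays the role of vpx_internal_error_info */
  int armed;
} ErrorInfo;

/* Errors raised while decoding this tile must longjmp into *this*
 * worker, so the shared xd slot is re-pointed only after the init step
 * that resets it. Returns 0 if the tile was corrupt. */
static int tile_worker_sketch(ErrorInfo *tile_error,
                              ErrorInfo **xd_error_info_slot) {
  if (setjmp(tile_error->jmp)) {
    tile_error->armed = 0;
    return 0;
  }
  tile_error->armed = 1;
  /* ...the init_macroblockd() equivalent runs here and resets the slot... */
  *xd_error_info_slot = tile_error;
  /* ...decode rows; on error: longjmp(tile_error->jmp, 1); ... */
  return 1;
}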
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index c12f95c51..2a5800382 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2801,13 +2801,13 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
     dc_quant_devisor = 4.0;
 #endif

-    fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
            "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
            "%10"PRId64" %10"PRId64" %10d "
            "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
            "%6d %6d %5d %5d %5d "
            "%10"PRId64" %10.3lf"
-           "%10lf %8u %10"PRId64" %10d %10d %10d\n",
+           "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n",
            cpi->common.current_video_frame,
            cm->width, cm->height,
            cpi->rc.source_alt_ref_pending,
@@ -3218,6 +3218,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   vpx_clear_system_state();
 }

+#define MAX_QSTEP_ADJ 4
+static int get_qstep_adj(int rate_excess, int rate_limit) {
+  int qstep =
+      rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX;
+  return VPXMIN(qstep, MAX_QSTEP_ADJ);
+}
+
 static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
                                     uint8_t *dest) {
   VP9_COMMON *const cm = &cpi->common;
@@ -3391,6 +3398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
       // to attempt to recode.
       int last_q = q;
      int retries = 0;
+      int qstep;

       if (cpi->resize_pending == 1) {
         // Change in frame size so go back around the recode loop.
@@ -3416,7 +3424,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
         q_high = rc->worst_quality;

         // Raise Qlow as to at least the current value
-        q_low = q < q_high ? q + 1 : q_high;
+        qstep =
+            get_qstep_adj(rc->projected_frame_size, rc->this_frame_target);
+        q_low = VPXMIN(q + qstep, q_high);
+        // q_low = q < q_high ? q + 1 : q_high;

         if (undershoot_seen || loop_at_this_size > 1) {
           // Update rate_correction_factor unless
@@ -3441,7 +3452,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size,
           overshoot_seen = 1;
         } else {
           // Frame is too small
-          q_high = q > q_low ? q - 1 : q_low;
+          qstep =
+              get_qstep_adj(rc->this_frame_target, rc->projected_frame_size);
+          q_high = VPXMAX(q - qstep, q_low);
+          // q_high = q > q_low ? q - 1 : q_low;

           if (overshoot_seen || loop_at_this_size > 1) {
             vp9_rc_update_rate_correction_factors(cpi);
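The recode loop above now widens the [q_low, q_high] bracket in proportion to the rate miss instead of stepping by 1, capped at MAX_QSTEP_ADJ. A standalone check of the arithmetic (the helper is copied from the hunk above; the sample values are illustrative):

#include <assert.h>
#include <limits.h>

#define MAX_QSTEP_ADJ 4
#define VPXMIN(a, b) ((a) < (b) ? (a) : (b))

/* Rounded ratio of rate miss to rate target, capped at MAX_QSTEP_ADJ. */
static int get_qstep_adj(int rate_excess, int rate_limit) {
  int qstep =
      rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX;
  return VPXMIN(qstep, MAX_QSTEP_ADJ);
}

int main(void) {
  assert(get_qstep_adj(30000, 10000) == 3); /* 3x overshoot -> 3 steps */
  assert(get_qstep_adj(9000, 10000) == 1);  /* near target -> 1 step */
  assert(get_qstep_adj(100000, 1000) == 4); /* large miss capped at 4 */
  assert(get_qstep_adj(5000, 0) == 4);      /* zero target: INT_MAX, capped */
  return 0;
}

So a frame that comes in at three times its target now raises q_low by three instead of one, cutting the number of recode passes on badly missed frames.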
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index ce9007ac3..2b7ddbcd9 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1573,7 +1573,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
         !(frame_mv[this_mode][ref_frame].as_int == 0 &&
           ref_frame == LAST_FRAME)) {
       if (usable_ref_frame < ALTREF_FRAME) {
-        if (!force_skip_low_temp_var) {
+        if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
           i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
           if ((cpi->ref_frame_flags & flag_list[i]))
             if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index fb2a92541..b3c3d7beb 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -12,14 +12,17 @@
 #include <tmmintrin.h>  // SSSE3

 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/fdct.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"

 void vp9_fdct8x8_quant_ssse3(
-    const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs,
+    const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs,
     int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr,
     const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-    int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
     uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) {
   __m128i zero;
   int pass;
@@ -328,15 +331,15 @@ void vp9_fdct8x8_quant_ssse3(
         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
         dequant = _mm_unpackhi_epi64(dequant, dequant);
         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
       }

       {
@@ -398,20 +401,21 @@ void vp9_fdct8x8_quant_ssse3(
           qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
           qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

           coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
           coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
         } else {
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+          // Maybe a more efficient way to store 0?
+          store_zero_tran_low(qcoeff_ptr + n_coeffs);
+          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-          _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
+          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
         }
       }

@@ -452,10 +456,10 @@ void vp9_fdct8x8_quant_ssse3(
     }
   } else {
     do {
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero);
-      _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
       n_coeffs += 8 * 2;
     } while (n_coeffs < 0);
     *eob_ptr = 0;
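The store_tran_low()/store_zero_tran_low() calls above come from the newly included vpx_dsp/x86/fdct.h. The reason for the indirection: tran_low_t is int32_t when CONFIG_VP9_HIGHBITDEPTH is enabled (plain int16_t otherwise), so eight int16 lanes must be sign-extended into two 32-bit stores. A sketch of the idea, not the actual header contents:

#include <emmintrin.h>
#include <stdint.h>

#ifndef CONFIG_VP9_HIGHBITDEPTH
#define CONFIG_VP9_HIGHBITDEPTH 0
#endif

#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;
#else
typedef int16_t tran_low_t;
#endif

/* Store 8 int16 lanes into a tran_low_t array (sketch; the real helper
 * can use aligned stores). */
static void store_tran_low_sketch(__m128i a, tran_low_t *b) {
#if CONFIG_VP9_HIGHBITDEPTH
  /* Interleaving each lane with its sign mask widens it to int32. */
  const __m128i sign = _mm_srai_epi16(a, 15);
  _mm_storeu_si128((__m128i *)b, _mm_unpacklo_epi16(a, sign));
  _mm_storeu_si128((__m128i *)(b + 4), _mm_unpackhi_epi16(a, sign));
#else
  _mm_storeu_si128((__m128i *)b, a);
#endif
}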
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 4a1ebbc8c..0a3e84a0d 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -553,6 +553,9 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
                            ctx->decrypt_cb, ctx->decrypt_state);
     if (res != VPX_CODEC_OK) return res;

+    if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1)
+      frame_count = ctx->svc_spatial_layer + 1;
+
     if (ctx->frame_parallel_decode) {
       // Decode in frame parallel mode. When decoding in this mode, the frame
       // passed to the decoder must be either a normal frame or a superframe with
@@ -1001,6 +1004,16 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }

+static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  ctx->svc_decoding = 1;
+  ctx->svc_spatial_layer = va_arg(args, int);
+  if (ctx->svc_spatial_layer < 0)
+    return VPX_CODEC_INVALID_PARAM;
+  else
+    return VPX_CODEC_OK;
+}
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { VP8_COPY_REFERENCE, ctrl_copy_reference },

@@ -1011,6 +1024,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   { VPXD_SET_DECRYPTOR, ctrl_set_decryptor },
   { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
   { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+  { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc },

   // Getters
   { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
diff --git a/vp9/vp9_dx_iface.h b/vp9/vp9_dx_iface.h
index cc3d51842..c1559599b 100644
--- a/vp9/vp9_dx_iface.h
+++ b/vp9/vp9_dx_iface.h
@@ -60,6 +60,10 @@ struct vpx_codec_alg_priv {
   void *ext_priv;  // Private data associated with the external frame buffers.
   vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
   vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+  // Allow for decoding up to a given spatial layer for SVC stream.
+  int svc_decoding;
+  int svc_spatial_layer;
 };

 #endif  // VP9_VP9_DX_IFACE_H_
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 88204acd3..0d7759eb2 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -111,6 +111,11 @@ enum vp8_dec_control_id {
    */
   VP9_SET_SKIP_LOOP_FILTER,

+  /** control function to decode an SVC stream up to spatial layer x,
+   * where x is passed in through the control and is 0 for the base layer.
+   */
+  VP9_DECODE_SVC_SPATIAL_LAYER,
+
   VP8_DECODER_CTRL_ID_MAX
 };

@@ -162,6 +167,8 @@ VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
 #define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c
index 4226b28f3..ebec9df54 100644
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -12,126 +12,9 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/txfm_common.h"

-// Multiply a by a_const. Saturate, shift and narrow by 14.
-static int16x8_t multiply_shift_and_narrow(const int16x8_t a,
-                                           const int16_t a_const) {
-  // Shift by 14 + rounding will be within 16 bits for well formed streams.
-  // See WRAPLOW and dct_const_round_shift for details.
-  // This instruction doubles the result and returns the high half, essentially
-  // resulting in a right shift by 15. By multiplying the constant first that
-  // becomes a right shift by 14.
-  // The largest possible value used here is
-  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just*
-  // within the range of int16_t (+32767 / -32768) even when negated.
-  return vqrdmulhq_n_s16(a, a_const * 2);
-}
-
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
-static int16x8_t add_multiply_shift_and_narrow(const int16x8_t a,
-                                               const int16x8_t b,
-                                               const int16_t ab_const) {
-  // In both add_ and its pair, sub_, the input for well-formed streams will be
-  // well within 16 bits (input to the idct is the difference between two frames
-  // and will be within -255 to 255, or 9 bits)
-  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
-  // input) this function can not use vaddq_s16.
-  // In order to match existing behavior and intentionally out of range tests,
-  // expand the addition up to 32 bits to prevent truncation.
-  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
-  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
-  temp_low = vmulq_n_s32(temp_low, ab_const);
-  temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
-}
-
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
-static int16x8_t sub_multiply_shift_and_narrow(const int16x8_t a,
-                                               const int16x8_t b,
-                                               const int16_t ab_const) {
-  int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
-  int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
-  temp_low = vmulq_n_s32(temp_low, ab_const);
-  temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
-}
-
-// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
-static int16x8_t multiply_accumulate_shift_and_narrow(const int16x8_t a,
-                                                      const int16_t a_const,
-                                                      const int16x8_t b,
-                                                      const int16_t b_const) {
-  int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const);
-  int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
-  temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
-  temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
-}
-
-// Shift the output down by 6 and add it to the destination buffer.
-static void add_and_store(const int16x8_t a0, const int16x8_t a1,
-                          const int16x8_t a2, const int16x8_t a3,
-                          const int16x8_t a4, const int16x8_t a5,
-                          const int16x8_t a6, const int16x8_t a7, uint8_t *b,
-                          const int b_stride) {
-  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
-  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
-  b0 = vld1_u8(b);
-  b += b_stride;
-  b1 = vld1_u8(b);
-  b += b_stride;
-  b2 = vld1_u8(b);
-  b += b_stride;
-  b3 = vld1_u8(b);
-  b += b_stride;
-  b4 = vld1_u8(b);
-  b += b_stride;
-  b5 = vld1_u8(b);
-  b += b_stride;
-  b6 = vld1_u8(b);
-  b += b_stride;
-  b7 = vld1_u8(b);
-  b -= (7 * b_stride);
-
-  // c = b + (a >> 6)
-  c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6);
-  c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6);
-  c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6);
-  c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6);
-  c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6);
-  c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6);
-  c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6);
-  c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6);
-
-  b0 = vqmovun_s16(c0);
-  b1 = vqmovun_s16(c1);
-  b2 = vqmovun_s16(c2);
-  b3 = vqmovun_s16(c3);
-  b4 = vqmovun_s16(c4);
-  b5 = vqmovun_s16(c5);
-  b6 = vqmovun_s16(c6);
-  b7 = vqmovun_s16(c7);
-
-  vst1_u8(b, b0);
-  b += b_stride;
-  vst1_u8(b, b1);
-  b += b_stride;
-  vst1_u8(b, b2);
-  b += b_stride;
-  vst1_u8(b, b3);
-  b += b_stride;
-  vst1_u8(b, b4);
-  b += b_stride;
-  vst1_u8(b, b5);
-  b += b_stride;
-  vst1_u8(b, b6);
-  b += b_stride;
-  vst1_u8(b, b7);
-}
-
 // Only for the first pass of the _34_ variant. Since it only uses values from
 // the top left 8x8 it can safely assume all the remaining values are 0 and skip
 // an awful lot of calculations. In fact, only the first 6 columns make the cut.
@@ -163,66 +46,51 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
       s2_31;
   int16x8_t s3_24, s3_25, s3_26, s3_27;

-  in0 = vld1q_s16(input);
-  input += 32;
-  in1 = vld1q_s16(input);
-  input += 32;
-  in2 = vld1q_s16(input);
-  input += 32;
-  in3 = vld1q_s16(input);
-  input += 32;
-  in4 = vld1q_s16(input);
-  input += 32;
-  in5 = vld1q_s16(input);
-  input += 32;
-  in6 = vld1q_s16(input);
-  input += 32;
-  in7 = vld1q_s16(input);
-
-  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+  load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
+                             &in6, &in7);

   // stage 1
   // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
-  s1_16 = multiply_shift_and_narrow(in1, cospi_31_64);
+  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
   // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
-  s1_31 = multiply_shift_and_narrow(in1, cospi_1_64);
+  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);

-  s1_20 = multiply_shift_and_narrow(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow(in5, cospi_5_64);
+  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);

-  s1_23 = multiply_shift_and_narrow(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow(in3, cospi_3_64);
+  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);

   // stage 2
-  s2_8 = multiply_shift_and_narrow(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow(in2, cospi_2_64);
+  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);

   // stage 3
-  s1_4 = multiply_shift_and_narrow(in4, cospi_28_64);
-  s1_7 = multiply_shift_and_narrow(in4, cospi_4_64);
+  s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+  s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);

-  s1_17 = multiply_accumulate_shift_and_narrow(s1_16, -cospi_4_64, s1_31,
-                                               cospi_28_64);
-  s1_30 = multiply_accumulate_shift_and_narrow(s1_16, cospi_28_64, s1_31,
-                                               cospi_4_64);
+  s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+                                                   cospi_28_64);
+  s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+                                                   cospi_4_64);

-  s1_21 = multiply_accumulate_shift_and_narrow(s1_20, -cospi_20_64, s1_27,
-                                               cospi_12_64);
-  s1_26 = multiply_accumulate_shift_and_narrow(s1_20, cospi_12_64, s1_27,
-                                               cospi_20_64);
+  s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
+                                                   cospi_12_64);
+  s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
+                                                   cospi_20_64);

-  s1_22 = multiply_accumulate_shift_and_narrow(s1_23, -cospi_12_64, s1_24,
-                                               -cospi_20_64);
-  s1_25 = multiply_accumulate_shift_and_narrow(s1_23, -cospi_20_64, s1_24,
-                                               cospi_12_64);
+  s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+                                                   -cospi_20_64);
+  s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+                                                   cospi_12_64);

   // stage 4
-  s1_0 = multiply_shift_and_narrow(in0, cospi_16_64);
+  s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);

-  s2_9 = multiply_accumulate_shift_and_narrow(s2_8, -cospi_8_64, s2_15,
-                                              cospi_24_64);
-  s2_14 = multiply_accumulate_shift_and_narrow(s2_8, cospi_24_64, s2_15,
-                                               cospi_8_64);
+  s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+                                                  cospi_24_64);
+  s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+                                                   cospi_8_64);

   s2_20 = vsubq_s16(s1_23, s1_20);
   s2_21 = vsubq_s16(s1_22, s1_21);
@@ -234,28 +102,28 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
   s2_27 = vsubq_s16(s1_24, s1_27);

   // stage 5
-  s1_5 = sub_multiply_shift_and_narrow(s1_7, s1_4, cospi_16_64);
-  s1_6 = add_multiply_shift_and_narrow(s1_4, s1_7, cospi_16_64);
+  s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
+  s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);

-  s1_18 = multiply_accumulate_shift_and_narrow(s1_17, -cospi_8_64, s1_30,
-                                               cospi_24_64);
-  s1_29 = multiply_accumulate_shift_and_narrow(s1_17, cospi_24_64, s1_30,
-                                               cospi_8_64);
+  s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30,
+                                                   cospi_24_64);
+  s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30,
+                                                   cospi_8_64);

-  s1_19 = multiply_accumulate_shift_and_narrow(s1_16, -cospi_8_64, s1_31,
-                                               cospi_24_64);
-  s1_28 = multiply_accumulate_shift_and_narrow(s1_16, cospi_24_64, s1_31,
-                                               cospi_8_64);
+  s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31,
+                                                   cospi_24_64);
+  s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31,
+                                                   cospi_8_64);

-  s1_20 = multiply_accumulate_shift_and_narrow(s2_20, -cospi_24_64, s2_27,
-                                               -cospi_8_64);
-  s1_27 = multiply_accumulate_shift_and_narrow(s2_20, -cospi_8_64, s2_27,
-                                               cospi_24_64);
+  s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
+                                                   -cospi_8_64);
+  s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
+                                                   cospi_24_64);

-  s1_21 = multiply_accumulate_shift_and_narrow(s2_21, -cospi_24_64, s2_26,
-                                               -cospi_8_64);
-  s1_26 = multiply_accumulate_shift_and_narrow(s2_21, -cospi_8_64, s2_26,
-                                               cospi_24_64);
+  s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
+                                                   -cospi_8_64);
+  s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
+                                                   cospi_24_64);

   // stage 6
   s2_0 = vaddq_s16(s1_0, s1_7);
@@ -267,11 +135,11 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
   s2_6 = vsubq_s16(s1_0, s1_6);
   s2_7 = vsubq_s16(s1_0, s1_7);

-  s2_10 = sub_multiply_shift_and_narrow(s2_14, s2_9, cospi_16_64);
-  s2_13 = add_multiply_shift_and_narrow(s2_9, s2_14, cospi_16_64);
+  s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64);
+  s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64);

-  s2_11 = sub_multiply_shift_and_narrow(s2_15, s2_8, cospi_16_64);
-  s2_12 = add_multiply_shift_and_narrow(s2_8, s2_15, cospi_16_64);
+  s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64);
+  s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64);

   s2_16 = vaddq_s16(s1_16, s2_23);
   s2_17 = vaddq_s16(s1_17, s2_22);
@@ -309,17 +177,17 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
   s1_14 = vsubq_s16(s2_1, s2_14);
   s1_15 = vsubq_s16(s2_0, s2_15);

-  s1_20 = sub_multiply_shift_and_narrow(s3_27, s2_20, cospi_16_64);
-  s1_27 = add_multiply_shift_and_narrow(s2_20, s3_27, cospi_16_64);
+  s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
+  s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);

-  s1_21 = sub_multiply_shift_and_narrow(s3_26, s2_21, cospi_16_64);
-  s1_26 = add_multiply_shift_and_narrow(s2_21, s3_26, cospi_16_64);
+  s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
+  s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);

-  s1_22 = sub_multiply_shift_and_narrow(s3_25, s2_22, cospi_16_64);
-  s1_25 = add_multiply_shift_and_narrow(s2_22, s3_25, cospi_16_64);
+  s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64);
+  s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64);

-  s1_23 = sub_multiply_shift_and_narrow(s3_24, s2_23, cospi_16_64);
-  s1_24 = add_multiply_shift_and_narrow(s2_23, s3_24, cospi_16_64);
+  s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64);
+  s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64);

   // final stage
   vst1q_s16(output, vaddq_s16(s1_0, s2_31));
@@ -403,82 +271,67 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
       s2_31;
   int16x8_t s3_24, s3_25, s3_26, s3_27;

-  in0 = vld1q_s16(input);
-  input += 8;
-  in1 = vld1q_s16(input);
-  input += 8;
-  in2 = vld1q_s16(input);
-  input += 8;
-  in3 = vld1q_s16(input);
-  input += 8;
-  in4 = vld1q_s16(input);
-  input += 8;
-  in5 = vld1q_s16(input);
-  input += 8;
-  in6 = vld1q_s16(input);
-  input += 8;
-  in7 = vld1q_s16(input);
-
-  transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
+  load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6,
+                             &in7);

   // stage 1
-  s1_16 = multiply_shift_and_narrow(in1, cospi_31_64);
-  s1_31 = multiply_shift_and_narrow(in1, cospi_1_64);
+  s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+  s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);

   // Different for _8_
-  s1_19 = multiply_shift_and_narrow(in7, -cospi_25_64);
-  s1_28 = multiply_shift_and_narrow(in7, cospi_7_64);
+  s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+  s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);

-  s1_20 = multiply_shift_and_narrow(in5, cospi_27_64);
-  s1_27 = multiply_shift_and_narrow(in5, cospi_5_64);
+  s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+  s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);

-  s1_23 = multiply_shift_and_narrow(in3, -cospi_29_64);
-  s1_24 = multiply_shift_and_narrow(in3, cospi_3_64);
+  s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+  s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);

   // stage 2
-  s2_8 = multiply_shift_and_narrow(in2, cospi_30_64);
-  s2_15 = multiply_shift_and_narrow(in2, cospi_2_64);
+  s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+  s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);

-  s2_11 = multiply_shift_and_narrow(in6, -cospi_26_64);
-  s2_12 = multiply_shift_and_narrow(in6, cospi_6_64);
+  s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+  s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);

   // stage 3
-  s1_4 = multiply_shift_and_narrow(in4, cospi_28_64);
-  s1_7 = multiply_shift_and_narrow(in4, cospi_4_64);
+  s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+  s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);

-  s1_17 = multiply_accumulate_shift_and_narrow(s1_16, -cospi_4_64, s1_31,
-                                               cospi_28_64);
-  s1_30 = multiply_accumulate_shift_and_narrow(s1_16, cospi_28_64, s1_31,
-                                               cospi_4_64);
+  s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+                                                   cospi_28_64);
+  s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+                                                   cospi_4_64);

   // Different for _8_
-  s1_18 = multiply_accumulate_shift_and_narrow(s1_19, -cospi_28_64, s1_28,
-                                               -cospi_4_64);
-  s1_29 = multiply_accumulate_shift_and_narrow(s1_19, -cospi_4_64, s1_28,
-                                               cospi_28_64);
+  s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28,
+                                                   -cospi_4_64);
+  s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28,
+                                                   cospi_28_64);
-  s1_21 = multiply_accumulate_shift_and_narrow(s1_20, -cospi_20_64, s1_27,
-                                               cospi_12_64);
-  s1_26 = multiply_accumulate_shift_and_narrow(s1_20, cospi_12_64, s1_27,
-                                               cospi_20_64);
+  s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27,
+                                                   cospi_12_64);
+  s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27,
+                                                   cospi_20_64);

-  s1_22 = multiply_accumulate_shift_and_narrow(s1_23, -cospi_12_64, s1_24,
-                                               -cospi_20_64);
-  s1_25 = multiply_accumulate_shift_and_narrow(s1_23, -cospi_20_64, s1_24,
-                                               cospi_12_64);
+  s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+                                                   -cospi_20_64);
+  s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+                                                   cospi_12_64);

   // stage 4
-  s1_0 = multiply_shift_and_narrow(in0, cospi_16_64);
+  s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);

-  s2_9 = multiply_accumulate_shift_and_narrow(s2_8, -cospi_8_64, s2_15,
-                                              cospi_24_64);
-  s2_14 = multiply_accumulate_shift_and_narrow(s2_8, cospi_24_64, s2_15,
-                                               cospi_8_64);
+  s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+                                                  cospi_24_64);
+  s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+                                                   cospi_8_64);

-  s2_10 = multiply_accumulate_shift_and_narrow(s2_11, -cospi_24_64, s2_12,
-                                               -cospi_8_64);
-  s2_13 = multiply_accumulate_shift_and_narrow(s2_11, -cospi_8_64, s2_12,
-                                               cospi_24_64);
+  s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12,
+                                                   -cospi_8_64);
+  s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12,
+                                                   cospi_24_64);

   s2_16 = vaddq_s16(s1_16, s1_19);
@@ -504,8 +357,8 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   s2_31 = vaddq_s16(s1_28, s1_31);

   // stage 5
-  s1_5 = sub_multiply_shift_and_narrow(s1_7, s1_4, cospi_16_64);
-  s1_6 = add_multiply_shift_and_narrow(s1_4, s1_7, cospi_16_64);
+  s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64);
+  s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64);

   s1_8 = vaddq_s16(s2_8, s2_11);
   s1_9 = vaddq_s16(s2_9, s2_10);
@@ -516,25 +369,25 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   s1_14 = vaddq_s16(s2_13, s2_14);
   s1_15 = vaddq_s16(s2_12, s2_15);

-  s1_18 = multiply_accumulate_shift_and_narrow(s2_18, -cospi_8_64, s2_29,
-                                               cospi_24_64);
-  s1_29 = multiply_accumulate_shift_and_narrow(s2_18, cospi_24_64, s2_29,
-                                               cospi_8_64);
+  s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29,
+                                                   cospi_24_64);
+  s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29,
+                                                   cospi_8_64);

-  s1_19 = multiply_accumulate_shift_and_narrow(s2_19, -cospi_8_64, s2_28,
-                                               cospi_24_64);
-  s1_28 = multiply_accumulate_shift_and_narrow(s2_19, cospi_24_64, s2_28,
-                                               cospi_8_64);
+  s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28,
+                                                   cospi_24_64);
+  s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28,
+                                                   cospi_8_64);

-  s1_20 = multiply_accumulate_shift_and_narrow(s2_20, -cospi_24_64, s2_27,
-                                               -cospi_8_64);
-  s1_27 = multiply_accumulate_shift_and_narrow(s2_20, -cospi_8_64, s2_27,
-                                               cospi_24_64);
+  s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27,
+                                                   -cospi_8_64);
+  s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27,
+                                                   cospi_24_64);

-  s1_21 = multiply_accumulate_shift_and_narrow(s2_21, -cospi_24_64, s2_26,
-                                               -cospi_8_64);
-  s1_26 = multiply_accumulate_shift_and_narrow(s2_21, -cospi_8_64, s2_26,
-                                               cospi_24_64);
+  s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26,
+                                                   -cospi_8_64);
+  s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26,
+                                                   cospi_24_64);

   // stage 6
   s2_0 = vaddq_s16(s1_0, s1_7);
@@ -546,11 +399,11 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   s2_6 = vsubq_s16(s1_0, s1_6);
   s2_7 = vsubq_s16(s1_0, s1_7);

-  s2_10 = sub_multiply_shift_and_narrow(s1_13, s1_10, cospi_16_64);
-  s2_13 = add_multiply_shift_and_narrow(s1_10, s1_13, cospi_16_64);
+  s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64);
+  s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64);

-  s2_11 = sub_multiply_shift_and_narrow(s1_12, s1_11, cospi_16_64);
-  s2_12 = add_multiply_shift_and_narrow(s1_11, s1_12, cospi_16_64);
+  s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64);
+  s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64);

   s1_16 = vaddq_s16(s2_16, s2_23);
   s1_17 = vaddq_s16(s2_17, s2_22);
@@ -588,17 +441,17 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   s1_14 = vsubq_s16(s2_1, s1_14);
   s1_15 = vsubq_s16(s2_0, s1_15);

-  s1_20 = sub_multiply_shift_and_narrow(s3_27, s2_20, cospi_16_64);
-  s1_27 = add_multiply_shift_and_narrow(s2_20, s3_27, cospi_16_64);
+  s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64);
+  s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64);

-  s1_21 = sub_multiply_shift_and_narrow(s3_26, s2_21, cospi_16_64);
-  s1_26 = add_multiply_shift_and_narrow(s2_21, s3_26, cospi_16_64);
+  s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64);
+  s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64);

-  s2_22 = sub_multiply_shift_and_narrow(s3_25, s1_22, cospi_16_64);
-  s1_25 = add_multiply_shift_and_narrow(s1_22, s3_25, cospi_16_64);
+  s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64);
+  s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64);

-  s2_23 = sub_multiply_shift_and_narrow(s3_24, s1_23, cospi_16_64);
-  s1_24 = add_multiply_shift_and_narrow(s1_23, s3_24, cospi_16_64);
+  s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64);
+  s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64);

   // final stage
   out0 = vaddq_s16(s1_0, s2_31);
@@ -610,7 +463,8 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   out6 = vaddq_s16(s1_6, s1_25);
   out7 = vaddq_s16(s1_7, s1_24);

-  add_and_store(out0, out1, out2, out3, out4, out5, out6, out7, output, stride);
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
+                       stride);

   out0 = vaddq_s16(s1_8, s2_23);
   out1 = vaddq_s16(s1_9, s2_22);
@@ -621,8 +475,8 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   out6 = vaddq_s16(s1_14, s1_17);
   out7 = vaddq_s16(s1_15, s1_16);

-  add_and_store(out0, out1, out2, out3, out4, out5, out6, out7,
-                output + (8 * stride), stride);
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+                       output + (8 * stride), stride);

   out0 = vsubq_s16(s1_15, s1_16);
   out1 = vsubq_s16(s1_14, s1_17);
@@ -633,8 +487,8 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   out6 = vsubq_s16(s1_9, s2_22);
   out7 = vsubq_s16(s1_8, s2_23);

-  add_and_store(out0, out1, out2, out3, out4, out5, out6, out7,
-                output + (16 * stride), stride);
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+                       output + (16 * stride), stride);
   out0 = vsubq_s16(s1_7, s1_24);
   out1 = vsubq_s16(s1_6, s1_25);
@@ -645,8 +499,8 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
   out6 = vsubq_s16(s1_1, s2_30);
   out7 = vsubq_s16(s1_0, s2_31);

-  add_and_store(out0, out1, out2, out3, out4, out5, out6, out7,
-                output + (24 * stride), stride);
+  add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+                       output + (24 * stride), stride);
 }

 void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest,
diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm
index a223c0b63..f39e8ddd4 100644
--- a/vpx_dsp/arm/idct_neon.asm
+++ b/vpx_dsp/arm/idct_neon.asm
@@ -27,3 +27,4 @@
         vld1.s16        {$dst0-$dst1,$dst2-$dst3}, [$src]!
     ENDIF
     MEND
+    END
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index 00be7ede8..5c2a53c03 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -14,6 +14,7 @@
 #include <arm_neon.h>

 #include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"

 //------------------------------------------------------------------------------
@@ -31,4 +32,141 @@ static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
 #endif
 }

+// Multiply a by a_const. Saturate, shift and narrow by 14.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+                                                      const int16_t a_const) {
+  // Shift by 14 + rounding will be within 16 bits for well formed streams.
+  // See WRAPLOW and dct_const_round_shift for details.
+  // This instruction doubles the result and returns the high half, essentially
+  // resulting in a right shift by 15. By multiplying the constant first that
+  // becomes a right shift by 14.
+  // The largest possible value used here is
+  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
+  // within the range of int16_t (+32767 / -32768) even when negated.
+  return vqrdmulhq_n_s16(a, a_const * 2);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+  // In both add_ and its pair, sub_, the input for well-formed streams will be
+  // well within 16 bits (input to the idct is the difference between two frames
+  // and will be within -255 to 255, or 9 bits)
+  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
+  // input) this function can not use vaddq_s16.
+  // In order to match existing behavior and intentionally out of range tests,
+  // expand the addition up to 32 bits to prevent truncation.
+  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+  temp_low = vmulq_n_s32(temp_low, ab_const);
+  temp_high = vmulq_n_s32(temp_high, ab_const);
+  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
+    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+  int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
+  int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
+  temp_low = vmulq_n_s32(temp_low, ab_const);
+  temp_high = vmulq_n_s32(temp_high, ab_const);
+  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
+// 14.
+static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); + int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); + temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); + temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, + int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +// Shift the output down by 6 and add it to the destination buffer. +static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, + const int16x8_t a2, const int16x8_t a3, + const int16x8_t a4, const int16x8_t a5, + const int16x8_t a6, const int16x8_t a7, + uint8_t *b, const int b_stride) { + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + b0 = vld1_u8(b); + b += b_stride; + b1 = vld1_u8(b); + b += b_stride; + b2 = vld1_u8(b); + b += b_stride; + b3 = vld1_u8(b); + b += b_stride; + b4 = vld1_u8(b); + b += b_stride; + b5 = vld1_u8(b); + b += b_stride; + b6 = vld1_u8(b); + b += b_stride; + b7 = vld1_u8(b); + b -= (7 * b_stride); + + // c = b + (a >> 6) + c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); + c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); + c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); + c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); + c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); + c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); + c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); + c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + + b0 = vqmovun_s16(c0); + b1 = vqmovun_s16(c1); + b2 = vqmovun_s16(c2); + b3 = vqmovun_s16(c3); + b4 = vqmovun_s16(c4); + b5 = vqmovun_s16(c5); + b6 = vqmovun_s16(c6); + b7 = vqmovun_s16(c7); + + vst1_u8(b, b0); + b += b_stride; + vst1_u8(b, b1); + b += b_stride; + vst1_u8(b, b2); + b += b_stride; + vst1_u8(b, b3); + b += b_stride; + vst1_u8(b, b4); + b += b_stride; + vst1_u8(b, b5); + b += b_stride; + vst1_u8(b, b6); + b += b_stride; + vst1_u8(b, b7); +} #endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 46ddd1da0..f3f543ddf 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -96,6 +96,7 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { void idct4_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step[4]; tran_high_t temp1, temp2; + // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; @@ -114,9 +115,9 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) { } void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[4], temp_out[4]; // Rows @@ -142,6 +143,7 @@ void 
vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int i; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -157,6 +159,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, void idct8_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; + // stage 1 step1[0] = input[0]; step1[2] = input[4]; @@ -209,9 +212,9 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) { } void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; // First transform rows @@ -236,6 +239,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { @@ -246,14 +250,13 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void iadst4_c(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; tran_low_t x0 = input[0]; tran_low_t x1 = input[1]; tran_low_t x2 = input[2]; tran_low_t x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; + memset(output, 0, 4 * sizeof(*output)); return; } @@ -283,7 +286,6 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { void iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; tran_high_t x0 = input[7]; tran_high_t x1 = input[0]; tran_high_t x2 = input[5]; @@ -294,8 +296,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; + memset(output, 0, 8 * sizeof(*output)); return; } @@ -359,13 +360,13 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { } void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; // First transform rows - // only first 4 row has non-zero coefs + // Only the first 4 rows have non-zero coefs for (i = 0; i < 4; ++i) { idct8_c(input, outptr); input += 8; @@ -550,9 +551,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows @@ -576,7 +577,6 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, void iadst16_c(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; tran_high_t s9, s10, s11, s12, s13, s14, s15; tran_high_t x0 = input[15]; tran_high_t x1 = input[0]; tran_high_t x2 = input[13]; @@ -596,9 +596,7 @@ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] =
output[13] = output[14] = output[15] = 0; + memset(output, 0, 16 * sizeof(*output)); return; } @@ -746,9 +744,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows. Since all non-zero dct coefficients are in @@ -774,6 +772,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { @@ -1151,9 +1150,9 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows @@ -1188,13 +1187,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 16x16 has non-zero coeff + // Only upper-left 16x16 has non-zero coeff for (i = 0; i < 16; ++i) { idct32_c(input, outptr); input += 32; @@ -1214,13 +1213,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 8x8 has non-zero coeff + // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { idct32_c(input, outptr); input += 32; @@ -1241,8 +1240,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1373,12 +1372,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); @@ -1389,9 +1388,9 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[4], temp_out[4]; uint16_t
*dest = CONVERT_TO_SHORTPTR(dest8); @@ -1418,10 +1417,10 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int i; tran_high_t a1; tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1452,12 +1451,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half vpx_highbd_idct4_c(step1, step1, bd); @@ -1472,8 +1471,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 @@ -1489,20 +1488,20 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. + // First transform rows for (i = 0; i < 8; ++i) { vpx_highbd_idct8_c(input, outptr, bd); input += 8; outptr += 8; } - // Then transform columns. + // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); @@ -1518,9 +1517,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int i, j; tran_high_t a1; tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -1567,10 +1567,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. 
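// (Editor's note, not in the patch: dct_const_round_shift then rounds away
// DCT_CONST_BITS = 14 of those 29 bits, and 29b - 14b = 15b.)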
// Hence the output bit depth is 15b. - output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); + output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); } void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1608,14 +1608,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); // stage 2 s0 = x0; @@ -1631,10 +1631,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { x1 = HIGHBD_WRAPLOW(s1 + s3, bd); x2 = HIGHBD_WRAPLOW(s0 - s2, bd); x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1642,10 +1642,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); output[1] = HIGHBD_WRAPLOW(-x4, bd); @@ -1657,22 +1657,23 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(-x1, bd); } -void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, +void 
vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. - // Only first 4 row has non-zero coefs. + // First transform rows + // Only the first 4 rows have non-zero coefs for (i = 0; i < 4; ++i) { vpx_highbd_idct8_c(input, outptr, bd); input += 8; outptr += 8; } - // Then transform columns. + + // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); @@ -1726,23 +1727,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1752,12 +1753,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -1771,12 +1772,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] =
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -1786,12 +1787,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -1803,8 +1804,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -1829,12 +1830,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -1859,20 +1860,20 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // 
First transform rows. + // First transform rows for (i = 0; i < 16; ++i) { vpx_highbd_idct16_c(input, outptr, bd); input += 16; outptr += 16; } - // Then transform columns. + // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -1936,22 +1937,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; @@ -1979,14 +1980,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x5 = HIGHBD_WRAPLOW(s1 - s5, bd); x6 = HIGHBD_WRAPLOW(s2 - s6, bd); x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); // stage 3 s0 = x0; @@ -2010,18 +2011,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x1 = HIGHBD_WRAPLOW(s1 + s3, bd); x2 = HIGHBD_WRAPLOW(s0 - s2, bd); x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); x8 = HIGHBD_WRAPLOW(s8 + s10, bd); x9 = HIGHBD_WRAPLOW(s9 + s11, bd); x10 = HIGHBD_WRAPLOW(s8 - s10, bd); x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); // stage 4 s2 = (-cospi_16_64) * (x2 + x3); @@ -2033,14 +2034,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); output[1] = HIGHBD_WRAPLOW(-x8, bd); @@ -2062,9 +2063,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -2076,7 +2077,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, outptr += 16; } - // Then transform columns. 
+ // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -2092,10 +2093,10 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int i, j; tran_high_t a1; tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2137,43 +2138,43 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = 
HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2187,23 +2188,23 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); @@ -2230,12 +2231,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -2250,22 +2251,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + 
step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2274,12 +2275,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -2289,12 +2290,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -2324,8 +2325,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - 
step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -2341,20 +2342,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2375,12 +2376,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2426,20 +2427,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - 
step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; @@ -2482,9 +2483,9 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -2520,19 +2521,20 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows - // Only upper-left 8x8 has non-zero coeff. 
+ // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { highbd_idct32_c(input, outptr, bd); input += 32; outptr += 32; } + // Columns for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; @@ -2549,10 +2551,10 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int i, j; int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { @@ -2560,4 +2562,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, dest += stride; } } + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h index e530730d5..13137659f 100644 --- a/vpx_dsp/inv_txfm.h +++ b/vpx_dsp/inv_txfm.h @@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { (void)bd; return input; } - -static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return (tran_high_t)rv; -} #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_EMULATE_HARDWARE diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index e9660ed39..d22da102e 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -253,6 +253,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index a67cda991..dabf97165 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -649,7 +649,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -699,8 +699,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct8x8_64_add sse2/; - add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct8x8_10_add sse2/; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vpx_highbd_idct8x8_12_add sse2/; add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct16x16_256_add sse2/; diff --git a/vpx_dsp/x86/fdct.h b/vpx_dsp/x86/fdct.h new file mode 100644 index 000000000..54a6d81fc --- /dev/null +++ b/vpx_dsp/x86/fdct.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_FDCT_H_ +#define VPX_DSP_X86_FDCT_H_ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 8 16-bit values. If the source is 32 bits then cast down. +// This does not saturate values. It only truncates. +static INLINE __m128i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2], + (int16_t)a[3], (int16_t)a[4], (int16_t)a[5], + (int16_t)a[6], (int16_t)a[7]); +#else + return _mm_load_si128((const __m128i *)a); +#endif +} + +// Store 8 16-bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +#else + _mm_store_si128((__m128i *)(b), a); +#endif +} + +// Zero fill 8 positions in the output buffer. +static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m128i zero = _mm_setzero_si128(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(a), zero); + _mm_store_si128((__m128i *)(a + 4), zero); +#else + _mm_store_si128((__m128i *)(a), zero); +#endif +} +#endif // VPX_DSP_X86_FDCT_H_ diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 00d18f917..d5fc1440c 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -3673,7 +3673,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; @@ -4017,8 +4017,8 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out; - out = highbd_dct_const_round_shift(input[0] * cospi_16_64); - out = highbd_dct_const_round_shift(out * cospi_16_64); + out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); a = ROUND_POWER_OF_TWO(out, 6); d = _mm_set1_epi32(a); diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 2c7e431c7..0580a7bd7 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -13,32 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" - -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t
*coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif -} +#include "vpx_dsp/x86/fdct.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -81,8 +56,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -117,15 +92,15 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -159,8 +134,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -191,14 +166,14 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -237,10 +212,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } } else { do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, dqcoeff_ptr + n_coeffs); + store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, qcoeff_ptr + n_coeffs); + store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 
2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/vpxdec.c b/vpxdec.c --- a/vpxdec.c +++ b/vpxdec.c @@ -92,31 +92,19 @@ static const arg_def_t md5arg = static const arg_def_t outbitdeptharg = ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); #endif - -static const arg_def_t *all_args[] = { &codecarg, - &use_yv12, - &use_i420, - &flipuvarg, - &rawvideo, - &noblitarg, - &progressarg, - &limitarg, - &skiparg, - &postprocarg, - &summaryarg, - &outputfile, - &threadsarg, - &frameparallelarg, - &verbosearg, - &scalearg, - &fb_arg, - &md5arg, - &error_concealment, - &continuearg, +static const arg_def_t svcdecodingarg = ARG_DEF( + NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); + +static const arg_def_t *all_args[] = { + &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, + &noblitarg, &progressarg, &limitarg, &skiparg, &postprocarg, + &summaryarg, &outputfile, &threadsarg, &frameparallelarg, &verbosearg, + &scalearg, &fb_arg, &md5arg, &error_concealment, &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - NULL }; + &svcdecodingarg, NULL +} #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -519,6 +507,8 @@ static int main_loop(int argc, const char **argv_) { #if CONFIG_VP9_HIGHBITDEPTH unsigned int output_bit_depth = 0; #endif + int svc_decoding = 0; + int svc_spatial_layer = 0; #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 }; #endif @@ -610,6 +600,10 @@ static int main_loop(int argc, const char **argv_) { output_bit_depth = arg_parse_uint(&arg); } #endif + else if (arg_match(&arg, &svcdecodingarg, argi)) { + svc_decoding = 1; + svc_spatial_layer = arg_parse_uint(&arg); + } #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { postproc = 1; @@ -726,7 +720,14 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_error(&decoder)); goto fail2; } - + if (svc_decoding) { + if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, + svc_spatial_layer)) { + fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + } if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER
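A minimal caller sketch of the same control path (editor's addition, not part of the patch; error handling trimmed and the layer value illustrative):

#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

// Limit VP9 SVC decoding to spatial layers 0..1, as vpxdec now does via
// --svc-decode-layer before the first frame is decoded.
static int init_svc_decoder(vpx_codec_ctx_t *decoder) {
  if (vpx_codec_dec_init(decoder, vpx_codec_vp9_dx(), NULL, 0)) return -1;
  if (vpx_codec_control(decoder, VP9_DECODE_SVC_SPATIAL_LAYER, 1)) return -1;
  return 0;
}

The equivalent vpxdec invocation would be something like vpxdec --svc-decode-layer=1 -o layer1.y4m svc_stream.ivf, with file names hypothetical.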