116 files changed, 3410 insertions, 2137 deletions
diff --git a/.clang-format b/.clang-format index c1483199e..e76a526e4 100644 --- a/.clang-format +++ b/.clang-format @@ -1,12 +1,12 @@ --- Language: Cpp # BasedOnStyle: Google -# Generated with clang-format 4.0.1 +# Generated with clang-format 5.0.0 AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false -AlignEscapedNewlinesLeft: true +AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: true @@ -33,14 +33,20 @@ BraceWrapping: BeforeCatch: false BeforeElse: false IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 @@ -48,7 +54,11 @@ Cpp11BracedListStyle: false DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH IncludeCategories: - Regex: '^<.*\.h>' Priority: 1 @@ -70,6 +80,7 @@ NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 @@ -79,6 +90,7 @@ PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Right ReflowComments: true SortIncludes: false +SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true @@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com> Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com> Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com> Alpha Lam <hclam@google.com> <hclam@chromium.org> +Chris Cunningham <chcunningham@chromium.org> Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com> Deb Mukherjee <debargha@google.com> Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com> @@ -21,18 +22,21 @@ Marco Paniconi <marpan@google.com> Marco Paniconi <marpan@google.com> <marpan@chromium.org> Pascal Massimino <pascal.massimino@gmail.com> Paul Wilkins <paulwilkins@google.com> +Peter Boström <pbos@chromium.org> <pbos@google.com> Peter de Rivaz <peter.derivaz@gmail.com> Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com> Ralph Giles <giles@xiph.org> <giles@entropywave.com> Ralph Giles <giles@xiph.org> <giles@mozilla.com> Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com> Sami Pietilä <samipietila@google.com> +Shiyou Yin <yinshiyou-hf@loongson.cn> Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com> Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com> Timothy B. 
Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com> Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org> +Urvang Joshi <urvang@google.com> <urvang@chromium.org> Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com> Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> @@ -3,13 +3,13 @@ Aaron Watry <awatry@gmail.com> Abo Talib Mahfoodh <ab.mahfoodh@gmail.com> -Adam Xu <adam@xuyaowu.com> Adrian Grange <agrange@google.com> Aℓex Converse <aconverse@google.com> Ahmad Sharif <asharif@google.com> Aleksey Vasenev <margtu-fivt@ya.ru> Alexander Potapenko <glider@google.com> Alexander Voronov <avoronov@graphics.cs.msu.ru> +Alexandra Hájková <alexandra.khirnova@gmail.com> Alexis Ballier <aballier@gentoo.org> Alok Ahuja <waveletcoeff@gmail.com> Alpha Lam <hclam@google.com> @@ -17,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com> Ami Fischman <fischman@chromium.org> Andoni Morales Alastruey <ylatuya@gmail.com> Andres Mejia <mcitadel@gmail.com> +Andrew Lewis <andrewlewis@google.com> Andrew Russell <anrussell@google.com> Angie Chiang <angiebird@google.com> Aron Rosenberg <arosenberg@logitech.com> @@ -24,7 +25,9 @@ Attila Nagy <attilanagy@google.com> Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> +Cheng Chen <chengchen@google.com> chm <chm@rock-chips.com> +Chris Cunningham <chcunningham@chromium.org> Christian Duvivier <cduvivier@google.com> Daniele Castagna <dcastagna@chromium.org> Daniel Kang <ddkang@google.com> @@ -46,10 +49,12 @@ Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> +Gregor Jasny <gjasny@gmail.com> Guillaume Martres <gmartres@google.com> Guillermo Ballester Valor <gbvalor@gmail.com> Hangyu Kuang <hkuang@google.com> Hanno Böck <hanno@hboeck.de> +Han Shen <shenhan@google.com> Henrik Lundin <hlundin@google.com> Hui Su <huisu@google.com> Ivan Krasin <krasin@chromium.org> @@ -83,6 +88,7 @@ Justin Clift <justin@salasaga.org> Justin Lebar <justin.lebar@gmail.com> Kaustubh Raste <kaustubh.raste@imgtec.com> KO Myung-Hun <komh@chollian.net> +Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> Linfeng Zhang <linfengz@google.com> Lou Quillio <louquillio@google.com> @@ -101,6 +107,7 @@ Mikhal Shemer <mikhal@google.com> Min Chen <chenm003@gmail.com> Minghai Shang <minghai@google.com> Min Ye <yeemmi@google.com> +Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. 
Egge <negge@mozilla.com> Nico Weber <thakis@chromium.org> @@ -111,12 +118,15 @@ Paul Wilkins <paulwilkins@google.com> Pavol Rusnak <stick@gk2.sk> Paweł Hajdan <phajdan@google.com> Pengchong Jin <pengchong@google.com> -Peter Boström <pbos@google.com> +Peter Boström <pbos@chromium.org> +Peter Collingbourne <pcc@chromium.org> Peter de Rivaz <peter.derivaz@gmail.com> Philip Jägenstedt <philipj@opera.com> Priit Laes <plaes@plaes.org> Rafael Ávila de Espíndola <rafael.espindola@gmail.com> Rafaël Carré <funman@videolan.org> +Rafael de Lucena Valle <rafaeldelucena@gmail.com> +Rahul Chaudhry <rahulchaudhry@google.com> Ralph Giles <giles@xiph.org> Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com> Rob Bradford <rob@linux.intel.com> @@ -135,6 +145,7 @@ Shiyou Yin <yinshiyou-hf@loongson.cn> Shunyao Li <shunyaoli@google.com> Stefan Holmer <holmer@google.com> Suman Sunkara <sunkaras@google.com> +Sylvestre Ledru <sylvestre@mozilla.com> Taekhyun Kim <takim@nvidia.com> Takanori MATSUURA <t.matsuu@gmail.com> Tamar Levy <tamar.levy@intel.com> @@ -147,6 +158,7 @@ Tom Finegan <tomfinegan@google.com> Tristan Matthews <le.businessman@gmail.com> Urvang Joshi <urvang@google.com> Vignesh Venkatasubramanian <vigneshv@google.com> +Vlad Tsyrklevich <vtsyrklevich@chromium.org> Yaowu Xu <yaowu@google.com> Yi Luo <luoyi@google.com> Yongzhe Wang <yongzhe@google.com> @@ -1,3 +1,28 @@ +2018-01-04 v1.7.0 "Mandarin Duck" + This release focused on high bit depth performance (10/12 bit) and vp9 + encoding improvements. + + - Upgrading: + This release is ABI incompatible due to new vp9 encoder features. + + Frame parallel decoding for vp9 has been removed. + + - Enhancements: + vp9 encoding supports additional threads with --row-mt. This can be greater + than the number of tiles. + + Two new vp9 encoder options have been added: + --corpus-complexity + --tune-content=film + + Additional tooling for respecting the vp9 "level" profiles has been added. + + - Bug fixes: + A variety of fuzzing issues. + vp8 threading fix for ARM. + Codec control VP9_SET_SKIP_LOOP_FILTER fixed. + Reject invalid multi resolution configurations. + 2017-01-09 v1.6.1 "Long Tailed Duck" This release improves upon the VP9 encoder and speeds up the encoding and decoding processes. @@ -1,4 +1,4 @@ -README - 26 January 2017 +README - 24 January 2018 Welcome to the WebM VP8/VP9 Codec SDK!
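The CHANGELOG items above surface through vpxenc flags (--row-mt, --corpus-complexity, --tune-content=film) and, for the first and last of these, through public codec controls in vpx/vp8cx.h. Below is a minimal sketch of opting into row-based multithreading and the new film tuning, assuming the v1.7.0 headers; the function name and the resolution/thread values are illustrative, not part of this commit:

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Sketch only: VP9E_SET_ROW_MT and VP9E_CONTENT_FILM are the
 * codec-control counterparts of --row-mt and --tune-content=film.
 * The config values here are placeholders. */
static int open_vp9_row_mt_encoder(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  cfg.g_w = 640;
  cfg.g_h = 480;
  cfg.g_threads = 4; /* with row-mt, threads may exceed the tile count */
  if (vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  if (vpx_codec_control(codec, VP9E_SET_ROW_MT, 1)) return -1; /* row-mt on */
  return vpx_codec_control(codec, VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_FILM);
}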
@@ -63,6 +63,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 0987cbfb8..d49a2307d 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -429,8 +429,9 @@ static void set_rate_control_stats(struct RateControlStats *rc, rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; if (tl > 0) { rc->layer_pfb[layer] = - 1000.0 * (cfg->layer_target_bitrate[layer] - - cfg->layer_target_bitrate[layer - 1]) / + 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / @@ -573,8 +574,8 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } else { if (is_key_frame) { ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; } else { ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; @@ -588,14 +589,24 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } else { ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + if (sl == num_spatial_layers - 1) + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; } } if (tl == 0) { ref_frame_config->lst_fb_idx[sl] = sl; - if (sl) - ref_frame_config->gld_fb_idx[sl] = sl - 1; - else + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { ref_frame_config->gld_fb_idx[sl] = 0; + } ref_frame_config->alt_fb_idx[sl] = 0; } else if (tl == 1) { ref_frame_config->lst_fb_idx[sl] = sl; @@ -738,6 +749,8 @@ int main(int argc, const char **argv) { // the encode for the whole superframe. The encoder will internally loop // over all the spatial layers for the current superframe. vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); + // TODO(jianj): Fix the parameter passing for "is_key_frame" in + // set_frame_flags_bypass_mode() for case of periodic key frames.
set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id, svc_ctx.spatial_layers, frame_cnt == 0, &ref_frame_config); diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index f5736ea45..4c985ba52 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -26,7 +26,9 @@ #include "../tools_common.h" #include "../video_writer.h" -#define VP8_ROI_MAP 0 +#define ROI_MAP 0 + +#define zero(Dest) memset(&Dest, 0, sizeof(Dest)); static const char *exec_name; @@ -99,9 +101,10 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc, for (i = 0; i < cfg->ts_number_layers; ++i) { if (i > 0) { rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; - rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] - - rc->layer_target_bitrate[i - 1]) / - (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); } rc->layer_input_frames[i] = 0; rc->layer_enc_frames[i] = 0; @@ -164,38 +167,60 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! \n"); } -#if VP8_ROI_MAP -static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { +#if ROI_MAP +static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, + vpx_roi_map_t *roi) { unsigned int i, j; - memset(roi, 0, sizeof(*roi)); + int block_size = 0; + uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0; + uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0; + if (!is_vp8 && !is_vp9) { + die("unsupported codec."); + } + zero(*roi); + + block_size = is_vp9 && !is_vp8 ? 8 : 16; // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for // segment is 16x16 for vp8, 8x8 for vp9. - roi->rows = (cfg->g_h + 15) / 16; - roi->cols = (cfg->g_w + 15) / 16; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; // Applies delta QP on the segment blocks, varies from -63 to 63. // Setting to negative means lower QP (better quality). // Below we set delta_q to the extreme (-63) to show strong effect. - roi->delta_q[0] = 0; + // VP8 uses the first 4 segments. VP9 uses all 8 segments. + zero(roi->delta_q); roi->delta_q[1] = -63; - roi->delta_q[2] = 0; - roi->delta_q[3] = 0; // Applies delta loopfilter strength on the segment blocks, varies from -63 to - // 63. Setting to positive means stronger loopfilter. - roi->delta_lf[0] = 0; - roi->delta_lf[1] = 0; - roi->delta_lf[2] = 0; - roi->delta_lf[3] = 0; - - // Applies skip encoding threshold on the segment blocks, varies from 0 to - // UINT_MAX. Larger value means more skipping of encoding is possible. - // This skip threshold only applies on delta frames. - roi->static_threshold[0] = 0; - roi->static_threshold[1] = 0; - roi->static_threshold[2] = 0; - roi->static_threshold[3] = 0; + // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 + // segments. VP9 uses all 8 segments. + zero(roi->delta_lf); + + if (is_vp8) { + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + zero(roi->static_threshold); + } + + if (is_vp9) { + // Apply skip segment. Setting to 1 means this block will be copied from + // previous frame. 
+ zero(roi->skip); + } + + if (is_vp9) { + // Apply ref frame segment. + // -1 : Do not apply this segment. + // 0 : Force using intra. + // 1 : Force using last. + // 2 : Force using golden. + // 3 : Force using altref but not used in non-rd pickmode for 0 lag. + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + } // Use 2 states: 1 is center square, 0 is the rest. roi->roi_map = @@ -563,7 +588,7 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VP8_ROI_MAP +#if ROI_MAP vpx_roi_map_t roi; #endif vpx_svc_layer_id_t layer_id = { 0, 0 }; @@ -766,8 +791,8 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); -#if VP8_ROI_MAP - vp8_set_roi_map(&cfg, &roi); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) die_codec(&codec, "Failed to set ROI map"); #endif @@ -784,6 +809,12 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0); +#endif // TODO(marpan/jianj): There is an issue with row-mt for low resolutions at // high speed settings, disable its use for those cases for now. if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) @@ -911,5 +942,8 @@ int main(int argc, char **argv) { for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); vpx_img_free(&raw); +#if ROI_MAP + free(roi.roi_map); +#endif return EXIT_SUCCESS; } diff --git a/libs.doxy_template b/libs.doxy_template index 5a8f84728..1eacc8fe2 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -943,18 +943,6 @@ GENERATE_XML = NO XML_OUTPUT = xml -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output.
Note that @@ -233,8 +233,8 @@ OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 5 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index 2fa10192f..08190c90d 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -215,7 +215,7 @@ using std::tr1::make_tuple; #if CONFIG_VP9_ENCODER const BlockinessParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), + make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); #endif diff --git a/test/consistency_test.cc b/test/consistency_test.cc index 37b4a45e5..c21751f71 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -205,7 +205,7 @@ using std::tr1::make_tuple; #if CONFIG_VP9_ENCODER const ConsistencyParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), + make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, ::testing::ValuesIn(c_vp9_tests)); diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 40eb972c0..65f63d654 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -44,7 +44,7 @@ class DatarateTestLarge denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; gf_boost_ = 0; - use_roi_ = 0; + use_roi_ = false; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -55,11 +55,9 @@ class DatarateTestLarge encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } -#if CONFIG_VP8_ENCODER - if (use_roi_ == 1) { + if (use_roi_) { encoder->Control(VP8E_SET_ROI_MAP, &roi_); } -#endif if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) @@ -152,7 +150,7 @@ class DatarateTestLarge int denoiser_offon_period_; int set_cpu_used_; int gf_boost_; - int use_roi_; + bool use_roi_; vpx_roi_map_t roi_; }; @@ -441,7 +439,7 @@ TEST_P(DatarateTestRealTime, RegionOfInterest) { ResetModel(); // Set ROI parameters - use_roi_ = 1; + use_roi_ = true; memset(&roi_, 0, sizeof(roi_)); roi_.rows = (cfg_.g_h + 15) / 16; @@ -539,6 +537,7 @@ class DatarateTestVP9Large denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; frame_parallel_decoding_mode_ = 1; + use_roi_ = false; } // @@ -621,6 +620,10 @@ class DatarateTestVP9Large encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, frame_parallel_decoding_mode_); + if (use_roi_) { + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + } + if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { encoder->Control(VP9E_SET_SVC, 1); @@ -701,6 +704,8 @@ class DatarateTestVP9Large int denoiser_offon_test_; int denoiser_offon_period_; int frame_parallel_decoding_mode_; + bool use_roi_; + vpx_roi_map_t roi_; }; // Check basic rate targeting for VBR mode with 0 lag. @@ -1073,6 +1078,68 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { } } +class DatarateTestVP9RealTime : public DatarateTestVP9Large { + public: + virtual ~DatarateTestVP9RealTime() {} +}; + +// Check VP9 region of interest feature. 
+TEST_P(DatarateTestVP9RealTime, RegionOfInterest) { + if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 7) / 8; + roi_.cols = (cfg_.g_w + 7) / 8; + + roi_.delta_q[1] = -20; + roi_.delta_lf[1] = -20; + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = reinterpret_cast<uint8_t *>( + calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map))); + ASSERT_TRUE(roi_.roi_map != NULL); + + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + #if CONFIG_VP9_TEMPORAL_DENOISING class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { public: @@ -1224,12 +1291,70 @@ class DatarateOnePassCbrSvc base_speed_setting_ = 5; spatial_layer_id_ = 0; temporal_layer_id_ = 0; + update_pattern_ = 0; memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); memset(bits_total_, 0, sizeof(bits_total_)); memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); dynamic_drop_layer_ = false; } virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode, except that we disable inter-layer prediction. 
+ void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + for (int sl = 0; sl < num_spatial_layers; ++sl) { + if (!tl) { + if (!sl) { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + } else { + if (is_key_frame) { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + } else { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + } + } + } else if (tl == 1) { + if (!sl) { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + } else { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_REF_GF; + } + } + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + } + } + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (video->frame() == 0) { @@ -1255,6 +1380,21 @@ class DatarateOnePassCbrSvc encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); } + if (update_pattern_ && video->frame() >= 100) { + vpx_svc_layer_id_t layer_id; + if (video->frame() == 100) { + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + encoder->Config(&cfg_); + } + // Set layer id since the pattern changed. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } + if (dynamic_drop_layer_) { if (video->frame() == 100) { // Change layer bitrates to set top layer to 0. 
This will trigger skip @@ -1351,6 +1491,14 @@ class DatarateOnePassCbrSvc << "Buffer Underrun at frame " << pkt->data.frame.pts; } } + + ASSERT_EQ(pkt->data.frame.width[sl], + top_sl_width_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); + + ASSERT_EQ(pkt->data.frame.height[sl], + top_sl_height_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); } } @@ -1393,6 +1541,10 @@ class DatarateOnePassCbrSvc int number_temporal_layers_; int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; bool dynamic_drop_layer_; + unsigned int top_sl_width_; + unsigned int top_sl_height_; + vpx_svc_ref_frame_config_t ref_frame_config; + int update_pattern_; }; static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, const vpx_svc_extra_cfg_t *svc_params, @@ -1486,6 +1638,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { number_spatial_layers_ = cfg_.ss_number_layers; number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; cfg_.rc_target_bitrate = 500; ResetModel(); tune_content_ = 1; @@ -1527,6 +1681,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. for (int i = 200; i <= 800; i += 200) { @@ -1577,6 +1733,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. // For SVC, noise_sen = 1 means denoising only the top spatial layer @@ -1633,6 +1791,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
for (int j = 64; j <= 67; j++) { @@ -1675,6 +1835,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { number_spatial_layers_ = cfg_.ss_number_layers; number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, @@ -1722,6 +1884,60 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 200 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one +// generated via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables +// inter-layer prediction. +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL2TLDynamicPatternChange) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 3; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 2; + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + // Change SVC pattern on the fly.
+ update_pattern_ = 1; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, @@ -1769,6 +1985,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL_to_2SL_dynamic) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; cfg_.rc_target_bitrate = 800; ResetModel(); dynamic_drop_layer_ = true; @@ -1812,6 +2030,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). for (int j = 32; j <= 35; j++) { @@ -1856,6 +2076,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { number_spatial_layers_ = cfg_.ss_number_layers; number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, @@ -1911,6 +2133,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { bits_in_buffer_model_[1] = cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); CheckLayerRateTargeting(&cfg_, number_spatial_layers_, number_temporal_layers_, file_datarate_, 0.78, 1.15); @@ -1926,6 +2150,9 @@ VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), ::testing::Range(2, 9)); +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); #if CONFIG_VP9_TEMPORAL_DENOISING VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, ::testing::Values(::libvpx_test::kRealTime), diff --git a/test/dct_test.cc b/test/dct_test.cc index a5ac9a0dc..71b36c7bb 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -597,7 +597,9 @@ class TransHT : public TransTestBase { TransHT() { fwd_txfm_ref = fht_ref; } }; -TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } +TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 
2 : 1); +} TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } @@ -605,17 +607,6 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); } TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2), - */ - const DctParam c_ht_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH make_tuple(&vp9_highbd_fht16x16_c, @@ -642,6 +633,19 @@ const DctParam c_ht_tests[] = { make_tuple(&vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8, 2), @@ -748,6 +752,69 @@ INSTANTIATE_TEST_CASE_P(C, TransHT, ::testing::ValuesIn(c_ht_tests)); #if !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON + +const DctParam neon_ht_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 0, + VPX_BITS_8, 1), + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1, + VPX_BITS_8, 1), + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 2, + VPX_BITS_8, 1), + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 3, + VPX_BITS_8, 1), + + make_tuple(&vp9_fht4x4_c, 
&iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_8, 1), + make_tuple(&vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_8, 1), + make_tuple(&vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_8, 1), + make_tuple(&vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_8, 1) +}; + +INSTANTIATE_TEST_CASE_P(NEON, TransHT, ::testing::ValuesIn(neon_ht_tests)); +#endif // HAVE_NEON + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, TransHT, @@ -784,6 +851,80 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE4_1, TransHT, ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_12, 2), + + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>, 4, 0, VPX_BITS_8, 
2), diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 164db5a7b..87e29b61d 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -106,4 +106,90 @@ TEST(EncodeAPI, ImageSizeSetting) { } #endif +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. +TEST(EncodeAPI, MultiResEncode) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif + }; + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (int c = 0; c < NELEMENTS(kCodecs); ++c) { + const vpx_codec_iface_t *const iface = kCodecs[c]; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. + const char kVP9Name[] = "WebM Project VP9"; + const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), + sizeof(kVP9Name) - 1) == 0; + EXPECT_EQ(is_vp9 ? 
VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + } // namespace diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 89a3b1767..bec6ca3e6 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -128,6 +128,11 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + void Control(int ctrl_id, struct vpx_svc_parameters *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -137,15 +142,12 @@ class Encoder { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } -#endif -#if CONFIG_VP8_ENCODER void Control(int ctrl_id, vpx_roi_map_t *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } #endif - void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 79220b0f6..43a4c6929 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -123,6 +123,7 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER const DecodeParam kVP8InvalidFileTests[] = { { 1, "invalid-bug-1443.ivf" }, + { 1, "invalid-token-partition.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, diff --git a/test/resize_test.cc b/test/resize_test.cc index 39952074b..5f80af6fb 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -278,10 +278,10 @@ class ResizeTest } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - ASSERT_NE(static_cast<int>(pkt->data.frame.width), 0); - ASSERT_NE(static_cast<int>(pkt->data.frame.height), 0); - encode_frame_width_.push_back(pkt->data.frame.width); - encode_frame_height_.push_back(pkt->data.frame.height); + ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); } unsigned int GetFrameWidth(size_t idx) const { @@ -485,10 +485,10 @@ class ResizeRealtimeTest } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - ASSERT_NE(static_cast<int>(pkt->data.frame.width), 0); - ASSERT_NE(static_cast<int>(pkt->data.frame.height), 0); - encode_frame_width_.push_back(pkt->data.frame.width); - encode_frame_height_.push_back(pkt->data.frame.height); + ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); } unsigned int GetMismatchFrames() { return mismatch_nframes_; } diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 9c407c649..e9d2efaa5 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -112,8 +112,9 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple( - &vpx_sum_squares_2d_i16_c, - 
&vpx_sum_squares_2d_i16_msa))); +INSTANTIATE_TEST_CASE_P( + MSA, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA } // namespace diff --git a/test/test-data.mk b/test/test-data.mk index f405e4ef1..7ca11bc9c 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -734,6 +734,8 @@ endif # CONFIG_VP9_HIGHBITDEPTH # Invalid files for testing libvpx error checking. LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 99b4e1e46..3a23ff5db 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -852,5 +852,7 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf +90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 30641ae8c..3405e4566 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -61,7 +61,6 @@ int main(int argc, char **argv) { #if !CONFIG_SHARED // Shared library builds don't support whitebox tests // that exercise internal symbols. - #if CONFIG_VP8 vp8_rtcd(); #endif // CONFIG_VP8 diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 955f567ce..d0cbfbbda 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -59,7 +59,7 @@ const TestVideoParam kTestVectors[] = { // Encoding modes tested const libvpx_test::TestMode kEncodingModeVectors[] = { ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime, + ::libvpx_test::kRealTime }; // Speed settings tested diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 1030204ae..5dd2ddb47 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -22,7 +22,7 @@ namespace { // Encoding modes const libvpx_test::TestMode kEncodingModeVectors[] = { ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime, + ::libvpx_test::kRealTime }; // Encoding speeds diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 8713ac655..f2234f045 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -532,6 +532,14 @@ INSTANTIATE_TEST_CASE_P( false))); #endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 && HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, + 16, true))); +#endif // ARCH_X86_64 && HAVE_AVX2 + + // TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index 5d7d38e89..f3e7f0a0e 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase, scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); } - void RunTest() { + void RunTest(INTERP_FILTER filter_type) { static const int kNumSizesToTest = 20; static const int kNumScaleFactorsToTest = 4; static const int kSizesToTest[] = { @@ -55,50 +55,48 @@ class ScaleTest : public VpxScaleBase, 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 }; static const int kScaleFactors[] = { 1, 2, 3, 4 }; - for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { - for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { - for (int h = 0; h < kNumSizesToTest; ++h) { - const int src_height = kSizesToTest[h]; - for (int w = 0; w < kNumSizesToTest; ++w) { - const int src_width = kSizesToTest[w]; - for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; - ++sf_up_idx) { - const int sf_up = kScaleFactors[sf_up_idx]; - for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; - ++sf_down_idx) { - const int sf_down = kScaleFactors[sf_down_idx]; - const int dst_width = src_width * sf_up / sf_down; - const int dst_height = src_height * sf_up / sf_down; - if (sf_up == sf_down && sf_up != 1) { - continue; - } - // I420 frame width and height must be even. - if (!dst_width || !dst_height || dst_width & 1 || - dst_height & 1) { - continue; - } - // vpx_convolve8_c() has restriction on the step which cannot - // exceed 64 (ratio 1 to 4). - if (src_width > 4 * dst_width || src_height > 4 * dst_height) { - continue; - } - ASSERT_NO_FATAL_FAILURE(ResetScaleImages( - src_width, src_height, dst_width, dst_height)); - ReferenceScaleFrame(filter_type, phase_scaler); - ScaleFrame(filter_type, phase_scaler); - if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, - ref_img_.frame_size)) { - printf( - "filter_type = %d, phase_scaler = %d, src_width = %4d, " - "src_height = %4d, dst_width = %4d, dst_height = %4d, " - "scale factor = %d:%d\n", - filter_type, phase_scaler, src_width, src_height, - dst_width, dst_height, sf_down, sf_up); - PrintDiff(); - } - CompareImages(dst_img_); - DeallocScaleImages(); + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). 
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, + dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); } } } @@ -145,7 +143,10 @@ class ScaleTest : public VpxScaleBase, ScaleFrameFunc scale_fn_; }; -TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } +TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } +TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); } +TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); } +TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); } TEST_P(ScaleTest, DISABLED_Speed) { static const int kCountSpeedTestBlock = 100; diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 576f5e906..8eff3f11f 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -147,7 +147,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) { // ----------------------------------------------------------------------------- // Multi-threaded decode tests - #if CONFIG_WEBM_IO struct FileList { const char *name; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 1a3aad16a..ddcc307eb 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -37,7 +37,9 @@ extern "C" { #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 -typedef struct { int r, c; } POS; +typedef struct { + int r, c; +} POS; #define PLANE_TYPE_Y_NO_DC 0 #define PLANE_TYPE_Y2 1 @@ -180,6 +182,9 @@ typedef struct { unsigned int low_res_ref_frames[MAX_REF_FRAMES]; // The video frame counter value for the key frame, for lowest resolution. unsigned int key_frame_counter_value; + // Flags to signal skipped encoding of previous and base layer stream. 
+ unsigned int skip_encoding_prev_stream; + unsigned int skip_encoding_base_stream; LOWER_RES_MB_INFO *mb_info; } LOWER_RES_FRAME_INFO; #endif diff --git a/vp8/common/extend.c b/vp8/common/extend.c index 2d67b516b..f4dbce2cd 100644 --- a/vp8/common/extend.c +++ b/vp8/common/extend.c @@ -20,8 +20,7 @@ static void copy_and_extend_plane(unsigned char *s, /* source */ int et, /* extend top border */ int el, /* extend left border */ int eb, /* extend bottom border */ - int er /* extend right border */ - ) { + int er) { /* extend right border */ int i; unsigned char *src_ptr1, *src_ptr2; unsigned char *dest_ptr1, *dest_ptr2; diff --git a/vp8/common/mips/mmi/idct_blk_mmi.c b/vp8/common/mips/mmi/idct_blk_mmi.c index f6020ab46..3f6907174 100644 --- a/vp8/common/mips/mmi/idct_blk_mmi.c +++ b/vp8/common/mips/mmi/idct_blk_mmi.c @@ -12,7 +12,7 @@ #include "vpx_mem/vpx_mem.h" void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, - int stride, int8_t *eobs) { + int stride, char *eobs) { int i, j; for (i = 0; i < 4; i++) { @@ -33,8 +33,7 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, } void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int stride, - int8_t *eobs) { + uint8_t *dstv, int stride, char *eobs) { int i, j; for (i = 0; i < 2; i++) { diff --git a/vp8/common/modecont.c b/vp8/common/modecont.c index d6ad9bb99..bab410374 100644 --- a/vp8/common/modecont.c +++ b/vp8/common/modecont.c @@ -11,28 +11,16 @@ #include "entropy.h" const int vp8_mode_contexts[6][4] = { - { - /* 0 */ - 7, 1, 1, 143, - }, - { - /* 1 */ - 14, 18, 14, 107, - }, - { - /* 2 */ - 135, 64, 57, 68, - }, - { - /* 3 */ - 60, 56, 128, 65, - }, - { - /* 4 */ - 159, 134, 128, 34, - }, - { - /* 5 */ - 234, 188, 128, 28, - }, + { /* 0 */ + 7, 1, 1, 143 }, + { /* 1 */ + 14, 18, 14, 107 }, + { /* 2 */ + 135, 64, 57, 68 }, + { /* 3 */ + 60, 56, 128, 65 }, + { /* 4 */ + 159, 134, 128, 34 }, + { /* 5 */ + 234, 188, 128, 28 }, }; diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index de59b3894..de836f19d 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -95,9 +95,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]); /* Temp data buffer used in filtering */ @@ -236,9 +234,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); if (xoffset) { diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index 04c027cd7..9c529f266 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -76,7 +76,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { } { - register int shift = vp8_norm[range]; + const int shift = vp8_norm[range]; range <<= shift; value <<= shift; count -= shift; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 077bd3da2..8bfd3cea3 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi, static int read_is_valid(const unsigned char
*start, size_t len, const unsigned char *end) { - return (start + len > start && start + len <= end); + return len != 0 && end > start && len <= (size_t)(end - start); } static unsigned int read_available_partition_size( diff --git a/vp8/decoder/ec_types.h b/vp8/decoder/ec_types.h index 0ab08b649..8da561f47 100644 --- a/vp8/decoder/ec_types.h +++ b/vp8/decoder/ec_types.h @@ -34,7 +34,9 @@ typedef struct { /* Structure used to hold all the overlaps of a macroblock. The overlaps of a * macroblock is further divided into block overlaps. */ -typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP; +typedef struct { + B_OVERLAP overlaps[16]; +} MB_OVERLAP; /* Structure for keeping track of motion vectors and which reference frame they * refer to. Used for motion vector interpolation. diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 5ecacdbb9..3689258f1 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -31,7 +31,9 @@ typedef struct { void *ptr2; } DECODETHREAD_DATA; -typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC; +typedef struct { + MACROBLOCKD mbd; +} MB_ROW_DEC; typedef struct { int enabled; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index d0213f75c..aadc8dc71 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -739,24 +739,21 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_yabove_row[i], - vpx_memalign( - 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); + CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (width + (VP8BORDERINPIXELS << 1)))); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_uabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_vabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. 
*/ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 1593f235e..3c5775f09 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -61,7 +61,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; #ifdef VP8_ENTROPY_STATS #if defined(SECTIONBITS_OUTPUT) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 70f924341..4ea991e52 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -989,11 +989,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, bits_per_mb_at_this_q = vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; - bits_per_mb_at_this_q = (int)(.5 + - err_correction_factor * speed_correction * - cpi->twopass.est_max_qcorrection_factor * - cpi->twopass.section_max_qfactor * - (double)bits_per_mb_at_this_q); + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + cpi->twopass.section_max_qfactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1086,9 +1086,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; bits_per_mb_at_this_q = - (int)(.5 + - err_correction_factor * speed_correction * clip_iifactor * - (double)bits_per_mb_at_this_q); + (int)(.5 + err_correction_factor * speed_correction * clip_iifactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1273,9 +1272,8 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_framerate(cpi, - 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * @@ -1739,10 +1737,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Dont break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && - (!flash_detected) && ((mv_ratio_accumulator > 100.0) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < 2.0)))) { + (!flash_detected) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0)))) { boost_score = old_boost_score; break; } @@ -1815,8 +1814,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (next_frame.pcnt_inter > 0.75) && ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) && - (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <= - (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) + (cpi->gfu_boost > 100) && + (cpi->twopass.gf_decay_rate <= + (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) #endif { int Boost; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 3c92be32f..2da940199 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2862,7 +2862,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) fclose(yframe); } #endif -/* return of 0 means drop frame */ #if !CONFIG_REALTIME_ONLY /* Function to test for conditions that indeicate we should loop @@ -3364,11 +3363,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; if (cpi->oxcf.mr_encoder_id) { - // TODO(marpan): This constraint shouldn't be needed, as we would like - // to allow for key frame setting (forced or periodic) defined per - // spatial layer. For now, keep this in. - cm->frame_type = low_res_frame_info->frame_type; - // Check if lower resolution is available for motion vector reuse. if (cm->frame_type != KEY_FRAME) { cpi->mr_low_res_mv_avail = 1; @@ -3393,7 +3387,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); */ } + // Disable motion vector reuse (i.e., disable any usage of the low_res) + // if the previous lower stream is skipped/disabled. + if (low_res_frame_info->skip_encoding_prev_stream) { + cpi->mr_low_res_mv_avail = 0; + } } + // This stream is not skipped (i.e., it's being encoded), so set this skip + // flag to 0. This is needed for the next stream (i.e., which is the next + // frame to be encoded). + low_res_frame_info->skip_encoding_prev_stream = 0; // On a key frame: For the lowest resolution, keep track of the key frame // counter value. For the higher resolutions, reset the current video @@ -4782,8 +4785,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->temporal_pattern_counter++; } -/* reset to normal state now that we are done. 
*/ - #if 0 { char filename[512]; @@ -4999,10 +5000,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, // be received for that high layer, which will yield an incorrect // frame rate (from time-stamp adjustment in above calculation). if (cpi->oxcf.mr_encoder_id) { - cpi->ref_framerate = low_res_frame_info->low_res_framerate; + if (!low_res_frame_info->skip_encoding_base_stream) + cpi->ref_framerate = low_res_frame_info->low_res_framerate; } else { // Keep track of frame rate for lowest resolution. low_res_frame_info->low_res_framerate = cpi->ref_framerate; + // The base stream is being encoded so set skip flag to 0. + low_res_frame_info->skip_encoding_base_stream = 0; } } #endif diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index ab9b8fdc8..fc833bccc 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -1052,9 +1052,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { * overflow when values are large */ projected_size_based_on_q = - (int)(((.5 + - rate_correction_factor * - vp8_bits_per_mb[cpi->common.frame_type][Q]) * + (int)(((.5 + rate_correction_factor * + vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index e210b4410..fa5dde15d 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -770,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, vp8_quantize_mbuv(x); rate_to = rd_cost_mbuv(x); - this_rate = rate_to + - x->intra_uv_mode_cost[xd->frame_type] - [xd->mode_info_context->mbmi.uv_mode]; + this_rate = + rate_to + x->intra_uv_mode_cost[xd->frame_type] + [xd->mode_info_context->mbmi.uv_mode]; this_distortion = vp8_mbuverror(x) / 4; diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h index dadbbe3f8..e76699092 100644 --- a/vp8/encoder/treewriter.h +++ b/vp8/encoder/treewriter.h @@ -56,8 +56,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], static void vp8_treed_write(vp8_writer *const w, vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ vp8_tree_index i = 0; do { @@ -73,8 +72,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t, } static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ int c = 0; vp8_tree_index i = 0; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8f0a782bc..d3e200594 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -802,7 +802,20 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; - if (!ctx->cfg.rc_target_bitrate) return res; + if (!ctx->cfg.rc_target_bitrate) { +#if CONFIG_MULTI_RES_ENCODING + if (!ctx->cpi) return VPX_CODEC_ERROR; + if (ctx->cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info; + if (!low_res_frame_info) return VPX_CODEC_ERROR; + low_res_frame_info->skip_encoding_prev_stream = 1; + if (ctx->cpi->oxcf.mr_encoder_id == 0) + low_res_frame_info->skip_encoding_base_stream = 1; + } +#endif + return res; + } if (img) res = validate_img(ctx, img); @@ -902,8 +915,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, (unsigned long)((delta * ctx->cfg.g_timebase.den + round) / 
ctx->cfg.g_timebase.num / 10000000); pkt.data.frame.flags = lib_flags << 16; - pkt.data.frame.width = cpi->common.Width; - pkt.data.frame.height = cpi->common.Height; + pkt.data.frame.width[0] = cpi->common.Width; + pkt.data.frame.height[0] = cpi->common.Height; if (lib_flags & FRAMEFLAGS_KEY) { pkt.data.frame.flags |= VPX_FRAME_IS_KEY; @@ -1261,6 +1274,9 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = { vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ - vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem, + vp8e_set_config, + NULL, + vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c new file mode 100644 index 000000000..46284238d --- /dev/null +++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void highbd_iadst4(int32x4_t *const io) { + const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; + const int32x4_t sinpi = vld1q_s32(sinpis); + int32x4_t s[8]; + + s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0); + s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1); + s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0); + s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1); + s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0); + s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1); + s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1); + s[7] = vsubq_s32(io[0], io[2]); + s[7] = vaddq_s32(s[7], io[3]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0); + + io[0] = vaddq_s32(s[0], s[3]); + io[1] = vaddq_s32(s[1], s[3]); + io[2] = s[2]; + io[3] = vaddq_s32(s[0], s[1]); + io[3] = vsubq_s32(io[3], s[3]); + io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS); + io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS); + io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS); + io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS); +} + +void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); 
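+        // The bd8 kernel returns the second output vector with its two
+        // halves swapped, so the vcombine() above restores row order.
+        // Every branch of this switch follows the separable 2-D pattern:
+        // a 1-D transform over rows, a transpose, then the 1-D transform
+        // over columns.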
+ transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max); +} diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index 025254c3f..4f0a90f21 100644 --- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,206 +14,63 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16(cospi_8_64); - *d1s16 = vdup_n_s16(cospi_16_64); - *d2s16 = vdup_n_s16(cospi_24_64); -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16(sinpi_1_9); - *d4s16 = vdup_n_s16(sinpi_2_9); - *q3s16 = vdupq_n_s16(sinpi_3_9); - *d5s16 = vdup_n_s16(sinpi_4_9); -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, 
int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vrshrn_n_s32(q13s32, 14); - d27s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d28s16 = vrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32(sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vrshrn_n_s32(q13s32, 14); - d17s16 = vrshrn_n_s32(q14s32, 14); - d18s16 = vrshrn_n_s32(q15s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); -} - void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; - - d26u32 = d27u32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - TRANSPOSE4X4(&q8s16, &q9s16); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); switch (tx_type) { - case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - case 2: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; - case 3: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += stride; - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); } diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 1c739861c..19163bc87 100644 --- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ 
b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,527 +14,199 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" -static int16_t cospi_2_64 = 16305; -static int16_t cospi_4_64 = 16069; -static int16_t cospi_6_64 = 15679; -static int16_t cospi_8_64 = 15137; -static int16_t cospi_10_64 = 14449; -static int16_t cospi_12_64 = 13623; -static int16_t cospi_14_64 = 12665; -static int16_t cospi_16_64 = 11585; -static int16_t cospi_18_64 = 10394; -static int16_t cospi_20_64 = 9102; -static int16_t cospi_22_64 = 7723; -static int16_t cospi_24_64 = 6270; -static int16_t cospi_26_64 = 4756; -static int16_t cospi_28_64 = 3196; -static int16_t cospi_30_64 = 1606; - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vrshrn_n_s32(q2s32, 14); - d15s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = 
vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vrshrn_n_s32(q2s32, 14); - d19s16 = vrshrn_n_s32(q3s32, 14); - d22s16 = vrshrn_n_s32(q13s32, 14); - d23s16 = vrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vrshrn_n_s32(q2s32, 14); - d27s16 = vrshrn_n_s32(q3s32, 14); - d30s16 = vrshrn_n_s32(q8s32, 14); - d31s16 = vrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vrshrn_n_s32(q9s32, 14); - d11s16 = vrshrn_n_s32(q10s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + const int16x8_t sum = vaddq_s16(x[0], x[1]); + const int16x8_t sub = vsubq_s16(x[0], x[1]); + int32x4_t t0[2], t1[2]; + + t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0); + t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0); + t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0); + t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); } -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = 
vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16(cospi_2_64); - d15s16 = vdup_n_s16(cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16(cospi_18_64); - d31s16 = vdup_n_s16(cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vrshrn_n_s32(q11s32, 14); - d23s16 = vrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vrshrn_n_s32(q1s32, 14); - d3s16 = vrshrn_n_s32(q2s32, 14); - d24s16 = vrshrn_n_s32(q12s32, 14); - d25s16 = vrshrn_n_s32(q15s32, 14); - d6s16 = vrshrn_n_s32(q3s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16(cospi_10_64); - d1s16 = vdup_n_s16(cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16(cospi_26_64); - d31s16 = vdup_n_s16(cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - d18s16 = vrshrn_n_s32(q9s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - d8s16 = vrshrn_n_s32(q4s32, 14); - d9s16 = vrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); 
- - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vrshrn_n_s32(q14s32, 14); - d19s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vrshrn_n_s32(q1s32, 14); - d29s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q7s32, 14); - d15s16 = vrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16(cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q3s32, 14); - d24s16 = vrshrn_n_s32(q13s32, 14); - d25s16 = vrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = 
vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, d30s16); +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} - d20s16 = vrshrn_n_s32(q13s32, 14); - d21s16 = vrshrn_n_s32(q1s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; - q5s16 = vdupq_n_s16(0); + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = 
vnegq_s16(x[1]); } void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input + 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); + + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); switch (tx_type) { - case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - case 2: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; - case 3: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); 
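+      // Both passes are ADST here: iadst8() on the rows, a transpose,
+      // then iadst8() again on the columns.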
+ iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += stride; - } + idct8x8_add8x8_neon(a, dest, stride); } diff --git a/vp9/common/arm/neon/vp9_iht_neon.h b/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 000000000..08daa1a4a --- /dev/null +++ b/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +#endif // VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index a575bda72..430b917b8 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, 177, 153, 140, 133, 130, 129 }; #endif +/* clang-format off */ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +/* clang-format on */ const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 1da491166..0ab502592 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly - #define COEFF_PROB_MODELS 255 #define UNCONSTRAINED_NODES 3 diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 47cd63e94..48cad3318 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { 93, 24, 99 }, // a split, l not split { 85, 119, 44 }, // l split, a not split { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 { 149, 53, 53 }, // a/l both not split { 94, 20, 48 }, // a split, l not split { 83, 53, 24 }, // l split, a not split { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 { 150, 40, 39 }, // a/l both not split { 78, 12, 26 }, // a split, l not split { 67, 33, 11 }, // l split, a not split { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 { 174, 35, 49 }, // a/l both not split { 68, 11, 27 }, // a split, l not 
split diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index a18a290cf..b6f052d08 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index c7c343aed..da9180b71 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1174,7 +1174,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0); + border_mask = ~(mi_col == 0 ? 1 : 0); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index a7ddc0b95..15c7e6376 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -229,9 +229,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); } else { - pred_context = 1 + - 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || - edge_mi->ref_frame[1] == GOLDEN_FRAME); + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter const int above_has_second = has_second_ref(above_mi); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index dd6120266..d048857dd 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -67,13 +67,13 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. - specialize qw/vp9_iht4x4_16_add sse2/; - specialize qw/vp9_iht8x8_64_add sse2/; + specialize qw/vp9_iht4x4_16_add neon sse2/; + specialize qw/vp9_iht8x8_64_add neon sse2/; specialize qw/vp9_iht16x16_256_add sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. - specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; - specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht4x4_16_add dspr2 msa/; + specialize qw/vp9_iht8x8_64_add dspr2 msa/; specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -97,13 +97,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
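# Each specialize() call lists the SIMD variants available for a prototype;
# the generated run-time setup then picks the best one the CPU supports,
# roughly (a sketch of the generated dispatch, not the literal output):
#   vp9_highbd_iht4x4_16_add = vp9_highbd_iht4x4_16_add_c;
#   if (flags & HAS_NEON) vp9_highbd_iht4x4_16_add = vp9_highbd_iht4x4_16_add_neon;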
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vp9_highbd_iht4x4_16_add sse4_1/; - } add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add sse4_1/; + specialize qw/vp9_highbd_iht16x16_256_add sse4_1/; + } } # @@ -126,7 +129,7 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_ add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 avx2/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; diff --git a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 000000000..57b79a732 --- /dev/null +++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + 
x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = 
_mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); 
+ x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); 
+ } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 000000000..7d949b6db --- /dev/null +++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + 
x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = 
_mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 6996260e2..ad693718c 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -10,8 +10,6 @@ #include "./vp9_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { @@ -22,23 +20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[1] = load_input_data8(input + 8); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct4_sse2(in); idct4_sse2(in); break; - case 1: // ADST_DCT + case ADST_DCT: idct4_sse2(in); iadst4_sse2(in); break; - case 2: // DCT_ADST + 
case DCT_ADST: iadst4_sse2(in); idct4_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst4_sse2(in); iadst4_sse2(in); break; - default: assert(0); break; } // Final round and shift @@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { - case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 1: // ADST_DCT - idct8_sse2(in); + case ADST_DCT: + vpx_idct8_sse2(in); iadst8_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; - default: assert(0); break; } // Final rounding and shift @@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, load_buffer_8x16(input, in1); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 1: // ADST_DCT + case ADST_DCT: idct16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - case 2: // DCT_ADST + case DCT_ADST: iadst16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - default: assert(0); break; } write_buffer_8x16(dest, in0, stride); diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 2f2f0055a..ed43de701 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -464,10 +464,6 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); } } - if (cpi->svc.spatial_layer_id > 0) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 12; - } if (cpi->oxcf.rc_mode == VPX_VBR) { // To be adjusted for VBR mode, e.g., based on gf period and boost. // For now use smaller qp-delta (than CBR), no second boosted seg, and diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index 2f7e54433..52a81afb5 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -12,7 +12,10 @@ #include "vp9/encoder/vp9_encoder.h" static const BLOCK_SIZE square[] = { - BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, }; static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 682477df1..6517fb358 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1513,9 +1513,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } - if (is_key_frame || (low_res && - vt.split[i].split[j].part_variances.none.variance > - threshold_4x4avg)) { + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -3403,9 +3403,10 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // Rate and distortion based partition search termination clause. 
if (!cpi->sf.ml_partition_search_early_termination && - !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { do_rect = 0; } } @@ -4620,8 +4621,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); - CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * - sizeof(*cpi->tile_data))); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; for (tile_row = 0; tile_row < tile_rows; ++tile_row) diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index f3c17f255..970077d89 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -50,7 +50,8 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, + { 10, 6 }, + { 8, 5 }, }; // 'num' can be negative, but 'shift' must be non-negative. @@ -200,9 +201,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int band_next = band_translate[i + 1]; const int token_next = (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; - unsigned int( - *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - token_costs + band_next; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; token_cache[rc] = vp9_pt_energy_class[t0]; ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x == 0); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1e91d0155..fd3889dda 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -65,12 +65,12 @@ #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. +#define HIGH_PRECISION_MV_QTHRESH 200 #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -547,6 +547,74 @@ static void apply_active_map(VP9_COMP *cpi) { } } +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + + // TODO(jianj): Investigate why ROI is not working in speed < 5 or in + // non-realtime mode.
+ if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. + internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + valid_ref = 0; + } + // GOLDEN was updated in the previous encoded frame, so GOLDEN and LAST + // are the same reference. + if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + static void init_level_info(Vp9LevelInfo *level_info) { Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelSpec *const level_spec = &level_info->level_spec; @@ -557,6 +625,13 @@ static void init_level_info(Vp9LevelInfo *level_info) { level_spec->min_altref_distance = INT_MAX; } +static int check_seg_range(int seg_data[8], int range) { + return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || + abs(seg_data[2]) > range || abs(seg_data[3]) > range || + abs(seg_data[4]) > range || abs(seg_data[5]) > range || + abs(seg_data[6]) > range || abs(seg_data[7]) > range); +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { int i; const Vp9LevelSpec *this_level; @@ -583,6 +658,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check that the number of rows and columns match. + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return -1; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return -1; + + // Also disable segmentation if no deltas are specified.
+ if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return 0; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + + // Copy to ROI structure in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return 0; +} + int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { @@ -817,6 +947,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; @@ -1121,8 +1254,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a - // target of 1/4x1/4. + // target of 1/4x1/4. number_spatial_layers must be greater than 2.
+ if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, @@ -2017,8 +2151,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); - CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); @@ -3630,6 +3765,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // it may be pretty bad for rate-control, // and I should handle it somehow vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (cpi->roi.enabled && cm->frame_type != KEY_FRAME) { + apply_roi_map(cpi); } apply_active_map(cpi); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index d723d93cb..2989af35e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -723,6 +723,8 @@ typedef struct VP9_COMP { uint8_t *count_arf_frame_usage; uint8_t *count_lastgolden_frame_usage; + + vpx_roi_map_t roi; } VP9_COMP; void vp9_initialize_enc(void); @@ -868,9 +870,8 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { #if CONFIG_VP9_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && - cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); } #endif @@ -938,6 +939,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]); + void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_set_row_mt(VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 4093b5529..f4fda0965 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -731,9 +731,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, - fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -2238,9 +2237,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, } gf_group->arf_update_idx[0] = arf_buffer_indices[0]; gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; - - // Step over the golden frame / overlay frame - if (EOF == input_stats(twopass, &frame_stats)) return; } // Deduct the boost bits for arf (or gf if it is not a key frame) @@ -2285,7 +2281,8 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); + normal_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); if (normal_frames > 1) normal_frame_bits = 
(int)(total_group_bits / normal_frames); else @@ -2551,9 +2548,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - if ( - // Break at active_max_gf_interval unless almost totally static. - ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // Break at maximum of active_max_gf_interval unless almost totally static. + if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && + (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && @@ -2573,8 +2570,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { + if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2602,7 +2599,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + rc->baseline_gf_interval = + (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) + ? (i - (is_key_frame || rc->source_alt_ref_pending)) + : i; // Only encode alt reference frame in temporal base layer. So // baseline_gf_interval should be multiple of a temporal layer group @@ -2700,13 +2700,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif } +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// For clean slide transitions we expect a sharp single-frame spike in error. +#define ERROR_SPIKE 5.0 + // Slide show transition detection. // Tests for case where there is very low error either side of the current frame // but much higher just for this frame. This can help detect key frames in // slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. // It will not help if the transition is a fade or other multi-frame effect. -static int slide_transition(double this_err, double last_err, double next_err) { - return (this_err > (last_err * 5.0)) && (this_err > (next_err * 5.0)); +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); } // Threshold for use of the lagging second reference frame.
High second ref @@ -2753,8 +2764,7 @@ static int test_candidate_kf(TWO_PASS *twopass, if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - (slide_transition(this_frame->coded_error, last_frame->coded_error, - next_frame->coded_error)) || + (slide_transition(this_frame, last_frame, next_frame)) || ((pcnt_intra > MIN_INTRA_LEVEL) && (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && ((this_frame->intra_error / @@ -3019,8 +3029,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double zm_factor; // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + // For the first frame in a kf group the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -3056,10 +3072,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST); + } else { + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + } // Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 000ecd779..aa497e3da 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -120,12 +120,12 @@ typedef enum { typedef struct { unsigned char index; unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; - FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; - int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; + RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; + unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; + unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1]; } GF_GROUP; typedef struct { diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h index df2fb98ef..c3af972bc 100644 --- a/vp9/encoder/vp9_mbgraph.h +++ b/vp9/encoder/vp9_mbgraph.h @@ -25,7 +25,9 @@ typedef struct { } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct VP9_COMP; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 44f01be25..37406c232 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1785,7 +1785,10 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { } static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, @@ -1876,7 +1879,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, { const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, }; cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index f2f323a28..212c260fa 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1488,7 +1488,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX; - unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1496,11 +1495,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; MV_REFERENCE_FRAME best_second_ref_frame = NONE; + const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + unsigned int thresh_svc_skip_golden = 500; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. 
+ if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > cpi->svc.lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. + else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < cpi->svc.lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1638,6 +1649,16 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) comp_modes = 2; + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN, this is to + // prevent possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, @@ -1699,6 +1720,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { force_gf_mv = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), @@ -1713,7 +1740,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (comp_pred) { - const struct segmentation *const seg = &cm->seg; if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -1785,29 +1811,34 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (sf->reference_masking && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { - if (usable_ref_frame < ALTREF_FRAME) { - if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { - i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) - ref_frame_skip_mask |= (1 << ref_frame); + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); } - } else if (!cpi->rc.is_src_frame_alt_ref && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == ALTREF_FRAME)) { - int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; - int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) - ref_frame_skip_mask |= (1 << ref_frame); } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; } - if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { @@ -2202,9 +2233,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For spatial enhancemanent layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference or is_key_frame is set. + // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer. if (cpi->svc.spatial_layer_id) { perform_intra_pred = + cpi->svc.temporal_layer_id == 0 || cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && @@ -2214,6 +2247,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.is_src_frame_alt_ref) perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4a7ccc4e5..2e4c9d2bf 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -31,10 +31,13 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ratectrl.h" -// Max rate target for 1080P and below encodes under normal circumstances -// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +// Max rate per frame for 1080P and below encodes if no level requirement is given. +// For larger formats limit to MAX_MB_RATE bits per MB. +// 4Mbits is derived from the level requirement for level 4 (1080P 30) which +// requires that HW can sustain a rate of 16Mbits over a 4-frame group. +// If a lower level requirement is specified then this may override this value.
#define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 +#define MAXRATE_1080P 4000000 #define DEFAULT_KF_BOOST 2000 #define DEFAULT_GF_BOOST 2000 @@ -1100,6 +1103,9 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 4; + } // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -1490,6 +1496,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; + if (cpi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) + cpi->svc.lower_layer_qindex = cm->base_qindex; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -1584,9 +1593,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). rc->gfu_boost = - VPXMAX(500, - DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -1861,13 +1869,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( cpi->framerate, rc->min_gf_interval); - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; - - if (is_altref_enabled(cpi)) { - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } + // Extended max interval for genuinely static scenes like slide shows. + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; @@ -1911,12 +1914,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. - // The baseline for this aligns with HW implementations that - // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits - // per 16x16 MB (averaged over a frame). However this limit is extended if - // a very high rate is given on the command line or the the rate cannnot - // be acheived because of a user specificed max q (e.g. when the user - // specifies lossless encode. + // However this limit is extended if a very high rate is given on the command + // line or the rate cannot be achieved because of a user specified max q + // (e.g. when the user specifies lossless encode). + // + // If a level is specified that requires a lower maximum rate then the level + // value takes precedence.
vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index c1b210677..3a40e0138 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -34,6 +34,14 @@ extern "C" { #define FRAME_OVERHEAD_BITS 200 +// Threshold used to define a KF group as static (e.g. a slide show). +// Essentially this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 + +// The maximum duration of a GF group that is static (for example a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2ba6378c5..90f06720b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -59,7 +59,9 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; struct rdcost_block_args { const VP9_COMP *cpi; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index eb39bab25..389d48f21 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -45,8 +45,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->ext_lst_fb_idx[sl] = 0; svc->ext_gld_fb_idx[sl] = 1; svc->ext_alt_fb_idx[sl] = 2; - svc->downsample_filter_type[sl] = EIGHTTAP; - svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. 
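// Editor's note on the default-filter change above: assuming the scaler's
// usual 16-phase (1/16-pel) convention, phase 8 places the BILINEAR taps
// halfway between source pixels, so the kernel reduces to a plain two-tap
// average, which is what the "averaging filter" comment refers to.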
} if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { @@ -155,6 +155,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { @@ -547,6 +549,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { if (!spatial_id) { cpi->ref_frame_flags = VP9_LAST_FLAG; } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; } } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 8b708e271..87686fe59 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -103,6 +103,8 @@ typedef struct SVC { int first_layer_denoise; int skip_enhancement_layer; + + int lower_layer_qindex; } SVC; struct VP9_COMP; diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index dbd243ac1..293cdcd67 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -170,13 +170,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, fadst4_sse2(in); write_buffer_4x4(output, in); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: assert(0); break; } } @@ -1097,14 +1097,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: assert(0); break; } } @@ -1963,13 +1963,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: assert(0); break; } } diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 000000000..4bebc34d6 --- /dev/null +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +// Zero fill 8 positions in the output buffer. 
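// Editor's note: tran_low_t is 32 bits wide when CONFIG_VP9_HIGHBITDEPTH is
// set and 16 bits otherwise, so each call below clears the same
// 16-coefficient span either with two 256-bit stores (32-bit lanes) or with a
// single 256-bit store (16-bit lanes).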
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as + // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. + const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); +#else + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); +#endif + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = _mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = load_tran_low(coeff_ptr + n_coeffs); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + } + + eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = 
_mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + eob256 = _mm256_max_epi16( + eob256, + scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 9819fb641..721e170bf 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -64,10 +64,13 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c @@ -78,10 +81,11 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5bfe9aa05..40f7ab531 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1067,12 +1067,11 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, vpx_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame)) + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) flags |= VPX_FRAME_IS_KEY; if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; @@ -1234,6 +1233,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, ctx->pending_frame_magnitude |= size; cx_data += size; cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = 
cpi->common.height; if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; @@ -1260,8 +1261,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( timebase, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); - pkt.data.frame.width = cpi->common.width; - pkt.data.frame.height = cpi->common.height; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; if (ctx->pending_cx_data) { if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; @@ -1340,9 +1341,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, @@ -1356,9 +1356,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vp9_copy_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, @@ -1371,9 +1370,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, @@ -1383,9 +1381,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; #else (void)ctx; (void)args; @@ -1407,17 +1404,24 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; - } else { - return NULL; } + return NULL; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { - (void)ctx; - (void)args; + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); - // TODO(yaowu): Need to re-implement and test for VP9. 
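/* Editor's sketch of how an application might drive the re-implemented
 * control; the map buffer and delta values are illustrative assumptions, not
 * taken from the patch:
 *
 *   vpx_roi_map_t roi;
 *   memset(&roi, 0, sizeof(roi));
 *   roi.rows = (cfg.g_h + 7) / 8;  // VP9 maps one id per 8x8 block
 *   roi.cols = (cfg.g_w + 7) / 8;
 *   roi.roi_map = map;             // segment id 0-7 for each block
 *   roi.delta_q[1] = -10;          // e.g. boost quality in segment 1
 *   vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi);
 */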
+ if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; + + if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame)) { + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_INVALID_PARAM; } @@ -1429,11 +1433,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, @@ -1444,11 +1447,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, @@ -1460,9 +1462,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, (VPX_SCALING)mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { @@ -1608,7 +1609,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_previewpp }, - { VP8E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_CPUUSED, ctrl_set_cpuused }, diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index d633ed142..6186d4614 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -103,6 +103,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 4390cf7c8..1cf2dca69 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder algorithms. 
* */ +#include <assert.h> +#include <limits.h> +#include <stdlib.h> #include <string.h> +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( int i; void *mem_loc = NULL; + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { for (i = 0; i < num_enc; i++) { vpx_codec_priv_enc_mr_cfg_t mr_cfg; @@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, an encoder at a higher + * resolution always uses the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher * resolution always use the same frame_type chosen by the * lowest-resolution encoder. */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 6c67ec24f..261ed8614 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -125,7 +125,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); enum vp8e_enc_control_id { /*!\brief Codec control function to pass an ROI map to encoder. * - * Supported in codecs: VP8, VP9 + * Supported in codecs: VP8 */ VP8E_SET_ROI_MAP = 8, @@ -423,6 +423,12 @@ enum vp8e_enc_control_id { */ VP9E_SET_SVC, + /*!\brief Codec control function to pass an ROI map to encoder. + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROI_MAP, + /*!\brief Codec control function to set parameters for SVC. * \note Parameters contain min_q, max_q, scaling factor for each of the * SVC layers. @@ -643,16 +649,20 @@ typedef enum vp9e_temporal_layering_mode { */ typedef struct vpx_roi_map { - /*! An id between 0 and 3 for each 16x16 region within a frame. */ + /*! Whether ROI is enabled. */ + uint8_t enabled; + /*! An id between 0-3 (0-7 for VP9) for each 16x16 (8x8 for VP9) + * region within a frame. */ unsigned char *roi_map; unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns.
*/ - // TODO(paulwilkins): broken for VP9 which has 8 segments - // q and loop filter deltas for each segment - // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ - /*! Static breakout threshold for each segment. */ + /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ + int delta_q[8]; /**< Quantizer deltas. */ + int delta_lf[8]; /**< Loop filter deltas. */ + /*! The skip and ref_frame segments are only used in VP9. */ + int skip[8]; /**< Skip this block. */ + int ref_frame[8]; /**< Reference frame for this block. */ + /*! Static breakout threshold for each segment. Only used in VP8. */ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -749,6 +759,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) #define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 41c53e48d..398c67022 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -179,6 +179,8 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER #define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index de964eb46..f9a2ed374 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (7 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (8 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -182,8 +182,10 @@ typedef struct vpx_codec_cx_pkt { * Only applicable when "output partition" mode is enabled. First * partition has id 0.*/ int partition_id; - unsigned int width; /**< frame width */ - unsigned int height; /**< frame height */ + /*!\brief Width and height of frames in this packet. VP8 will only use the + * first entry. */ + unsigned int width[VPX_SS_MAX_LAYERS]; /**< frame width */ + unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */ } frame; /**< data for compressed frame packet */ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h index ad70cdd57..ac17c529d 100644 --- a/vpx/vpx_frame_buffer.h +++ b/vpx/vpx_frame_buffer.h @@ -57,7 +57,7 @@ typedef struct vpx_codec_frame_buffer { * return 0. On any failure the callback must return a value less than 0.
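* Editor's sketch of a conforming callback; the function name is an
* illustrative assumption, not part of this header:
*
*   int example_get_frame_buffer(void *priv, size_t min_size,
*                                vpx_codec_frame_buffer_t *fb) {
*     (void)priv;
*     fb->data = (uint8_t *)malloc(min_size);
*     if (fb->data == NULL) return -1;
*     fb->size = min_size;
*     fb->priv = NULL;
*     return 0;
*   }
*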
* * \param[in] priv Callback's private data - * \param[in] new_size Size in bytes needed by the buffer + * \param[in] min_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 5358839b5..3fa2f9e28 100644 --- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -14,58 +14,33 @@ #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t, - int32x4x2_t *const d0, - int32x4x2_t *const d1) { - int32x2x2_t t32[4]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS); - t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS); - t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS); - t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS); - d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]); - d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]); +static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) { + int32x2x2_t t32; + + t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); } -static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t, - int32x4_t *const d0, - int32x4_t *const d1) { - int32x2x2_t t32[2]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]); - *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]); +static INLINE void dct_const_round_shift_high_4_dual( + const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) { + *d0 = dct_const_round_shift_high_4(in[0]); + *d1 = dct_const_round_shift_high_4(in[1]); } static INLINE int32x4x2_t -highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) { - int32x2x2_t t32[2]; - int32x4x2_t d; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - return d; +dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = dct_const_round_shift_high_4(in[0]); + out.val[1] = dct_const_round_shift_high_4(in[1]); + return out; } -static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) { - int32x2x2_t t32; - - t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS); - t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS); - return vcombine_s32(t32.val[0], t32.val[1]); +static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in, + int32x4x2_t *const d0, + 
int32x4x2_t *const d1) { + *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0); + *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2); } static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, @@ -107,7 +82,7 @@ static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, vget_low_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, @@ -149,7 +124,7 @@ static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, vget_low_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, @@ -191,7 +166,7 @@ static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, vget_low_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, @@ -233,7 +208,7 @@ static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, vget_high_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, @@ -275,7 +250,7 @@ static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, vget_high_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, @@ -317,7 +292,7 @@ static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, vget_high_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_q_kernel( @@ -386,7 +361,7 @@ static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, int64x2x2_t t[4]; highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, @@ -397,7 +372,7 @@ static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, int64x2x2_t t[2]; highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, @@ -412,7 +387,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void 
highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, @@ -425,7 +400,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, @@ -459,7 +434,7 @@ static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, @@ -481,7 +456,7 @@ static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct16x16_add_stage7_dual( @@ -815,7 +790,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, @@ -824,7 +799,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, @@ -835,7 +810,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, @@ -844,7 +819,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, diff --git a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 96a55c472..5b36f7336 100644 --- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -124,83 +124,77 @@ static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); } -static INLINE void load_s32x4q_dual( - const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1, - int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4, - int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) { - s0->val[0] = vld1q_s32(in); - s0->val[1] = vld1q_s32(in 
+ 4); +static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) { + s[0].val[0] = vld1q_s32(in); + s[0].val[1] = vld1q_s32(in + 4); in += 32; - s1->val[0] = vld1q_s32(in); - s1->val[1] = vld1q_s32(in + 4); + s[1].val[0] = vld1q_s32(in); + s[1].val[1] = vld1q_s32(in + 4); in += 32; - s2->val[0] = vld1q_s32(in); - s2->val[1] = vld1q_s32(in + 4); + s[2].val[0] = vld1q_s32(in); + s[2].val[1] = vld1q_s32(in + 4); in += 32; - s3->val[0] = vld1q_s32(in); - s3->val[1] = vld1q_s32(in + 4); + s[3].val[0] = vld1q_s32(in); + s[3].val[1] = vld1q_s32(in + 4); in += 32; - s4->val[0] = vld1q_s32(in); - s4->val[1] = vld1q_s32(in + 4); + s[4].val[0] = vld1q_s32(in); + s[4].val[1] = vld1q_s32(in + 4); in += 32; - s5->val[0] = vld1q_s32(in); - s5->val[1] = vld1q_s32(in + 4); + s[5].val[0] = vld1q_s32(in); + s[5].val[1] = vld1q_s32(in + 4); in += 32; - s6->val[0] = vld1q_s32(in); - s6->val[1] = vld1q_s32(in + 4); + s[6].val[0] = vld1q_s32(in); + s[6].val[1] = vld1q_s32(in + 4); in += 32; - s7->val[0] = vld1q_s32(in); - s7->val[1] = vld1q_s32(in + 4); + s[7].val[0] = vld1q_s32(in); + s[7].val[1] = vld1q_s32(in + 4); } -static INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1, - int32x4x2_t a2, int32x4x2_t a3, - int32x4x2_t a4, int32x4x2_t a5, - int32x4x2_t a6, int32x4x2_t a7, +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a, int32_t **out) { - transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - vst1q_s32(*out, a0.val[0]); + vst1q_s32(*out, a[0].val[0]); *out += 4; - vst1q_s32(*out, a0.val[1]); + vst1q_s32(*out, a[0].val[1]); *out += 4; - vst1q_s32(*out, a1.val[0]); + vst1q_s32(*out, a[1].val[0]); *out += 4; - vst1q_s32(*out, a1.val[1]); + vst1q_s32(*out, a[1].val[1]); *out += 4; - vst1q_s32(*out, a2.val[0]); + vst1q_s32(*out, a[2].val[0]); *out += 4; - vst1q_s32(*out, a2.val[1]); + vst1q_s32(*out, a[2].val[1]); *out += 4; - vst1q_s32(*out, a3.val[0]); + vst1q_s32(*out, a[3].val[0]); *out += 4; - vst1q_s32(*out, a3.val[1]); + vst1q_s32(*out, a[3].val[1]); *out += 4; - vst1q_s32(*out, a4.val[0]); + vst1q_s32(*out, a[4].val[0]); *out += 4; - vst1q_s32(*out, a4.val[1]); + vst1q_s32(*out, a[4].val[1]); *out += 4; - vst1q_s32(*out, a5.val[0]); + vst1q_s32(*out, a[5].val[0]); *out += 4; - vst1q_s32(*out, a5.val[1]); + vst1q_s32(*out, a[5].val[1]); *out += 4; - vst1q_s32(*out, a6.val[0]); + vst1q_s32(*out, a[6].val[0]); *out += 4; - vst1q_s32(*out, a6.val[1]); + vst1q_s32(*out, a[6].val[1]); *out += 4; - vst1q_s32(*out, a7.val[0]); + vst1q_s32(*out, a[7].val[0]); *out += 4; - vst1q_s32(*out, a7.val[1]); + vst1q_s32(*out, a[7].val[1]); *out += 4; } static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { int i; - int32x4x2_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4x2_t s[8]; for (i = 0; i < 4; i++, input += 8) { - load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + load_s32x4q_dual(input, s); + transpose_and_store_s32_8x8(s, &t_buf); } } diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 1418a75a1..7be1dad1d 100644 --- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -11,27 +11,10 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void 
highbd_idct4x4_1_add_kernel1(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { - const uint16x4_t a0 = vld1_u16(*dest); - const uint16x4_t a1 = vld1_u16(*dest + stride); - const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); - // Note: In some profile tests, res is quite close to +/-32767. - // We use saturating addition. - const int16x8_t b = vqaddq_s16(res, a); - const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1_u16(*dest, vget_low_u16(d)); - *dest += stride; - vst1_u16(*dest, vget_high_u16(d)); - *dest += stride; -} - // res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, @@ -65,109 +48,42 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); } -static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); - b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); - b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); - b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); - b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); - b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); - b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); - b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); - b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); - b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - -static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); - c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); - c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); - c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); - c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); - c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); - c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); - c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); - c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); - c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); - c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); - c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); - c4 = vsubq_s64(c4, c8); - c5 = vsubq_s64(c5, c9); - c6 = vaddq_s64(c6, c10); - c7 = vaddq_s64(c7, c11); - b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), - vrshrn_n_s64(c1, DCT_CONST_BITS)); - b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), - vrshrn_n_s64(c3, DCT_CONST_BITS)); - b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), - vrshrn_n_s64(c5, DCT_CONST_BITS)); - b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), - vrshrn_n_s64(c7, DCT_CONST_BITS)); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - void 
vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - int32x4_t c0 = vld1q_s32(input); - int32x4_t c1 = vld1q_s32(input + 4); - int32x4_t c2 = vld1q_s32(input + 8); - int32x4_t c3 = vld1q_s32(input + 12); - int16x8_t a0, a1; + int16x8_t a[2]; + int32x4_t c[4]; - if (bd == 8) { - const int16x4_t cospis = vld1_s16(kCospi); + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + if (bd == 8) { // Rows - a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); - a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); } else { - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); } - a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); - a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4)); } - highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); - highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max); } diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c index dd90134a6..e51e574cc 100644 --- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -127,7 +127,7 @@ static INLINE void idct8x8_12_half1d_bd12( int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t input1l, input1h, input3l, input3h; int32x2_t step1l[2], step1h[2]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -136,23 +136,23 @@ static INLINE void idct8x8_12_half1d_bd12( transpose_s32_4x4(io0, io1, io2, io3); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); step1h[1] = vget_high_s32(*io2); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, 
vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); @@ -222,9 +222,7 @@ static INLINE void idct8x8_12_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint16_t *dest, +static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest, const int stride, const int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const uint16_t *dst = dest; @@ -248,14 +246,14 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, dst += stride; d7 = vld1q_u16(dst); - d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); - d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); - d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); - d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); - d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); - d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); - d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); - d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); + d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7)); d0_s16 = vminq_s16(d0_s16, max); d1_s16 = vminq_s16(d1_s16, max); @@ -293,11 +291,13 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 8); - int32x4_t a2 = vld1q_s32(input + 16); - int32x4_t a3 = vld1q_s32(input + 24); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 8); + a[2] = vld1q_s32(input + 16); + a[3] = vld1q_s32(input + 24); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); @@ -305,54 +305,52 @@ void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t b0 = vmovn_s32(a0); - int16x4_t b1 = vmovn_s32(a1); - int16x4_t b2 = vmovn_s32(a2); - int16x4_t b3 = vmovn_s32(a3); - int16x4_t b4, b5, b6, b7; - - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, 
- &b5, &b6, &b7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, - b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); - c0 = vrshrq_n_s16(c0, 5); - c1 = vrshrq_n_s16(c1, 5); - c2 = vrshrq_n_s16(c2, 5); - c3 = vrshrq_n_s16(c3, 5); - c4 = vrshrq_n_s16(c4, 5); - c5 = vrshrq_n_s16(c5, 5); - c6 = vrshrq_n_s16(c6, 5); - c7 = vrshrq_n_s16(c7, 5); + int16x4_t b[8]; + + b[0] = vmovn_s32(a[0]); + b[1] = vmovn_s32(a[1]); + b[2] = vmovn_s32(a[2]); + b[3] = vmovn_s32(a[3]); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c); + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; if (bd == 10) { - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } else { - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } static INLINE void idct8x8_64_half1d_bd10( @@ -428,8 +426,8 @@ static INLINE void idct8x8_64_half1d_bd12( 
int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; + int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int32x2_t step1l[4], step1h[4]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -438,14 +436,14 @@ static INLINE void idct8x8_64_half1d_bd12( transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); - input_5l = vget_low_s32(*io5); - input_5h = vget_high_s32(*io5); - input_7l = vget_low_s32(*io7); - input_7h = vget_high_s32(*io7); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + input5l = vget_low_s32(*io5); + input5h = vget_high_s32(*io5); + input7l = vget_low_s32(*io7); + input7h = vget_high_s32(*io7); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); @@ -455,22 +453,22 @@ static INLINE void idct8x8_64_half1d_bd12( step1l[3] = vget_low_s32(*io6); step1h[3] = vget_high_s32(*io6); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); - t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); - t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); - t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); - t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); - t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); - t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); - t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1); t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); t32[2] = 
vrshrn_n_s64(t64[2], DCT_CONST_BITS); @@ -553,79 +551,83 @@ static INLINE void idct8x8_64_half1d_bd12( void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 4); - int32x4_t a2 = vld1q_s32(input + 8); - int32x4_t a3 = vld1q_s32(input + 12); - int32x4_t a4 = vld1q_s32(input + 16); - int32x4_t a5 = vld1q_s32(input + 20); - int32x4_t a6 = vld1q_s32(input + 24); - int32x4_t a7 = vld1q_s32(input + 28); - int32x4_t a8 = vld1q_s32(input + 32); - int32x4_t a9 = vld1q_s32(input + 36); - int32x4_t a10 = vld1q_s32(input + 40); - int32x4_t a11 = vld1q_s32(input + 44); - int32x4_t a12 = vld1q_s32(input + 48); - int32x4_t a13 = vld1q_s32(input + 52); - int32x4_t a14 = vld1q_s32(input + 56); - int32x4_t a15 = vld1q_s32(input + 60); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); - int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); - int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); - int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); - int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); - int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); - int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); - int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); - - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - - c0 = vrshrq_n_s16(b0, 5); - c1 = vrshrq_n_s16(b1, 5); - c2 = vrshrq_n_s16(b2, 5); - c3 = vrshrq_n_s16(b3, 5); - c4 = vrshrq_n_s16(b4, 5); - c5 = vrshrq_n_s16(b5, 5); - c6 = vrshrq_n_s16(b6, 5); - c7 = vrshrq_n_s16(b7, 5); + int16x8_t b[8]; + + b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + idct8x8_64_1d_bd8(cospis0, cospis1, b); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + + c[0] = vrshrq_n_s16(b[0], 5); + c[1] = vrshrq_n_s16(b[1], 5); + c[2] = vrshrq_n_s16(b[2], 5); + c[3] = vrshrq_n_s16(b[3], 5); + c[4] = vrshrq_n_s16(b[4], 5); + c[5] = vrshrq_n_s16(b[5], 5); + c[6] = vrshrq_n_s16(b[6], 5); + c[7] = vrshrq_n_s16(b[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { - idct8x8_64_half1d_bd10(cospis0, cospis1, 
&a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } else { - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } diff --git a/vpx_dsp/arm/highbd_idct_neon.h b/vpx_dsp/arm/highbd_idct_neon.h new file mode 100644 index 000000000..92fcb7f3a --- /dev/null +++ b/vpx_dsp/arm/highbd_idct_neon.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ +#define VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + int64x2_t c[12]; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1); + c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1); + c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1); + c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1); + c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1); + c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1); + c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1); + c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1); + c[4] = vsubq_s64(c[4], c[8]); + c[5] = vsubq_s64(c[5], c[9]); + c[6] = vaddq_s64(c[6], c[10]); + c[7] = vaddq_s64(c[7], c[11]); + b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS), + vrshrn_n_s64(c[5], DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS), + vrshrn_n_s64(c[7], DCT_CONST_BITS)); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +#endif // VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c index 
021211bc9..057731ad9 100644 --- a/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -650,14 +650,10 @@ void vpx_idct32_16_neon(const int16_t *const input, void *const output, highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c index f3c336fa3..f570547e4 100644 --- a/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -490,14 +490,10 @@ void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 673a36840..8192ee4cf 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -19,44 +19,41 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const uint8_t *dst = dest; - const int16x4_t cospis = vld1_s16(kCospi); - uint8x8_t dest01_u8; - uint32x2_t dest32_u32 = vdup_n_u32(0); - int16x8_t a0, a1; - uint8x8_t d01, d32; - uint16x8_t d01_u16, d32_u16; + uint32x2_t s32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s, d[2]; + uint16x8_t sum[2]; assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % sizeof(uint32_t))); // Rows - a0 = load_tran_low_to_s16q(input); - a1 = load_tran_low_to_s16q(input + 8); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); - dest01_u8 = load_u8(dst, stride); + s = load_u8(dst, stride); dst += 2 * stride; // The elements 
are loaded in reverse order. - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1); dst += stride; - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0); - d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); - d32_u16 = - vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); - d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); - d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32)); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); - store_u8(dest, stride, d01); + store_u8(dest, stride, d[0]); dest += 2 * stride; // The elements are stored in reverse order. - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1); dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0); } diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 1121ade27..7471387e4 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -17,91 +17,25 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint8_t *dest, - const int stride) { - const uint8_t *dst = dest; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - - a0 = vrshrq_n_s16(a0, 5); - a1 = vrshrq_n_s16(a1, 5); - a2 = vrshrq_n_s16(a2, 5); - a3 = vrshrq_n_s16(a3, 5); - a4 = vrshrq_n_s16(a4, 5); - a5 = vrshrq_n_s16(a5, 5); - a6 = vrshrq_n_s16(a6, 5); - a7 = vrshrq_n_s16(a7, 5); - - d0 = vld1_u8(dst); - dst += stride; - d1 = vld1_u8(dst); - dst += stride; - d2 = vld1_u8(dst); - dst += stride; - d3 = vld1_u8(dst); - dst += stride; - d4 = vld1_u8(dst); - dst += stride; - d5 = vld1_u8(dst); - dst += stride; - d6 = vld1_u8(dst); - dst += stride; - d7 = vld1_u8(dst); - - d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); - d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); - d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); - d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); - d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); - d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); - d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); - d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - - d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); - d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); - d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); - d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); - d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); - d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); - d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); - d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - - vst1_u8(dest, d0); - dest += stride; - vst1_u8(dest, d1); - dest += stride; - vst1_u8(dest, d2); - dest += stride; - vst1_u8(dest, d3); - dest += stride; - vst1_u8(dest, d4); - dest += stride; - vst1_u8(dest, d5); - dest += stride; - vst1_u8(dest, d6); - dest += stride; - vst1_u8(dest, d7); -} - void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16x8_t cospis = 
vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t a0 = load_tran_low_to_s16q(input); - int16x8_t a1 = load_tran_low_to_s16q(input + 8); - int16x8_t a2 = load_tran_low_to_s16q(input + 16); - int16x8_t a3 = load_tran_low_to_s16q(input + 24); - int16x8_t a4 = load_tran_low_to_s16q(input + 32); - int16x8_t a5 = load_tran_low_to_s16q(input + 40); - int16x8_t a6 = load_tran_low_to_s16q(input + 48); - int16x8_t a7 = load_tran_low_to_s16q(input + 56); - - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + a[2] = load_tran_low_to_s16q(input + 16); + a[3] = load_tran_low_to_s16q(input + 24); + a[4] = load_tran_low_to_s16q(input + 32); + a[5] = load_tran_low_to_s16q(input + 40); + a[6] = load_tran_low_to_s16q(input + 48); + a[7] = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_add8x8_neon(a, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, @@ -111,17 +45,15 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; - int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x4_t a[8]; + int16x8_t b[8]; - a0 = load_tran_low_to_s16d(input); - a1 = load_tran_low_to_s16d(input + 8); - a2 = load_tran_low_to_s16d(input + 16); - a3 = load_tran_low_to_s16d(input + 24); + a[0] = load_tran_low_to_s16d(input); + a[1] = load_tran_low_to_s16d(input + 8); + a[2] = load_tran_low_to_s16d(input + 16); + a[3] = load_tran_low_to_s16d(input + 24); - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, - &a5, &a6, &a7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, - a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); + idct8x8_add8x8_neon(b, dest, stride); } diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index 6ed02af5a..c4d3b4711 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -78,6 +78,28 @@ static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, //------------------------------------------------------------------------------ +static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) { + return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS), + vrshrn_n_s32(in[1], DCT_CONST_BITS)); +} + +static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + *d0 = dct_const_round_shift_low_8(t32 + 0); + *d1 = dct_const_round_shift_low_8(t32 + 2); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2(const int64x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], 
DCT_CONST_BITS), + vrshrn_n_s64(in[3], DCT_CONST_BITS)); + return out; +} + // Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { @@ -102,24 +124,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16( // input) this function can not use vaddq_s16. // In order to match existing behavior and intentionally out of range tests, // expand the addition up to 32 bits to prevent truncation. - int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( const int16x8_t a, const int16x8_t b, const int16_t ab_const) { - int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Multiply a by a_const and b by b_const, then accumulate. 
Shift and narrow by @@ -127,12 +149,12 @@ static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( const int16x8_t a, const int16_t a_const, const int16x8_t b, const int16_t b_const) { - int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); - int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); - temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); - temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vmull_n_s16(vget_low_s16(a), a_const); + t[1] = vmull_n_s16(vget_high_s16(a), a_const); + t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const); + t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const); + return dct_const_round_shift_low_8(t); } //------------------------------------------------------------------------------ @@ -145,53 +167,43 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( static INLINE int32x4x2_t multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { int64x2_t b[4]; - int32x4x2_t c; + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); - c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS), - vrshrn_n_s64(b[1], DCT_CONST_BITS)); - c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS), - vrshrn_n_s64(b[3], DCT_CONST_BITS)); - return c; + return dct_const_round_shift_high_4x2(b); } // Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vaddq_s32(a.val[0], b.val[0]); + t[1] = vaddq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. 
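// Roughly, each output lane below computes the scalar expression
//   out[i] = (int32_t)(((int64_t)(a[i] - b[i]) * ab_const +
//                       (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
// (an illustrative sketch only; the rounding narrow is done by vrshrn_n_s64).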
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vsubq_s32(a.val[0], b.val[0]); + t[1] = vsubq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by @@ -200,7 +212,6 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, const int32_t b_const) { int64x2_t c[4]; - int32x4x2_t d; c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); @@ -209,72 +220,66 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + return dct_const_round_shift_high_4x2(c); } // Shift the output down by 6 and add it to the destination buffer. 
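// Per pixel this amounts to
//   dst[i] = clip_pixel(dst[i] + ROUND_POWER_OF_TWO(a[i], 6));
// expressed with the library's scalar helpers: vrsraq_n_s16 fuses the
// rounding shift with the accumulate, and vqmovun_s16 supplies the final
// unsigned saturation.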
-static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, - const int16x8_t a2, const int16x8_t a3, - const int16x8_t a4, const int16x8_t a5, - const int16x8_t a6, const int16x8_t a7, - uint8_t *b, const int b_stride) { - uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; - b0 = vld1_u8(b); - b += b_stride; - b1 = vld1_u8(b); - b += b_stride; - b2 = vld1_u8(b); - b += b_stride; - b3 = vld1_u8(b); - b += b_stride; - b4 = vld1_u8(b); - b += b_stride; - b5 = vld1_u8(b); - b += b_stride; - b6 = vld1_u8(b); - b += b_stride; - b7 = vld1_u8(b); - b -= (7 * b_stride); +static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d, + const int stride) { + uint8x8_t b[8]; + int16x8_t c[8]; + + b[0] = vld1_u8(d); + d += stride; + b[1] = vld1_u8(d); + d += stride; + b[2] = vld1_u8(d); + d += stride; + b[3] = vld1_u8(d); + d += stride; + b[4] = vld1_u8(d); + d += stride; + b[5] = vld1_u8(d); + d += stride; + b[6] = vld1_u8(d); + d += stride; + b[7] = vld1_u8(d); + d -= (7 * stride); // c = b + (a >> 6) - c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); - c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); - c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); - c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); - c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); - c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); - c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); - c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); - - b0 = vqmovun_s16(c0); - b1 = vqmovun_s16(c1); - b2 = vqmovun_s16(c2); - b3 = vqmovun_s16(c3); - b4 = vqmovun_s16(c4); - b5 = vqmovun_s16(c5); - b6 = vqmovun_s16(c6); - b7 = vqmovun_s16(c7); - - vst1_u8(b, b0); - b += b_stride; - vst1_u8(b, b1); - b += b_stride; - vst1_u8(b, b2); - b += b_stride; - vst1_u8(b, b3); - b += b_stride; - vst1_u8(b, b4); - b += b_stride; - vst1_u8(b, b5); - b += b_stride; - vst1_u8(b, b6); - b += b_stride; - vst1_u8(b, b7); + c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6); + c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6); + c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6); + c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6); + c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6); + c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6); + c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6); + c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6); + + b[0] = vqmovun_s16(c[0]); + b[1] = vqmovun_s16(c[1]); + b[2] = vqmovun_s16(c[2]); + b[3] = vqmovun_s16(c[3]); + b[4] = vqmovun_s16(c[4]); + b[5] = vqmovun_s16(c[5]); + b[6] = vqmovun_s16(c[6]); + b[7] = vqmovun_s16(c[7]); + + vst1_u8(d, b[0]); + d += stride; + vst1_u8(d, b[1]); + d += stride; + vst1_u8(d, b[2]); + d += stride; + vst1_u8(d, b[3]); + d += stride; + vst1_u8(d, b[4]); + d += stride; + vst1_u8(d, b[5]); + d += stride; + vst1_u8(d, b[6]); + d += stride; + vst1_u8(d, b[7]); } static INLINE uint8x16_t create_dcq(const int16_t dc) { @@ -283,56 +288,53 @@ static INLINE uint8x16_t create_dcq(const int16_t dc) { return vdupq_n_u8((uint8_t)t); } -static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, - int16x8_t *const a0, - int16x8_t *const a1) { - int16x4_t b0, b1, b2, b3; - int32x4_t c0, c1, c2, c3; - int16x8_t d0, d1; - - transpose_s16_4x4q(a0, a1); - b0 = vget_low_s16(*a0); - b1 = 
vget_high_s16(*a0); - b2 = vget_low_s16(*a1); - b3 = vget_high_s16(*a1); - c0 = vmull_lane_s16(b0, cospis, 2); - c2 = vmull_lane_s16(b1, cospis, 2); - c1 = vsubq_s32(c0, c2); - c0 = vaddq_s32(c0, c2); - c2 = vmull_lane_s16(b2, cospis, 3); - c3 = vmull_lane_s16(b2, cospis, 1); - c2 = vmlsl_lane_s16(c2, b3, cospis, 1); - c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); - b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); - b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); - b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); - d0 = vcombine_s16(b0, b1); - d1 = vcombine_s16(b3, b2); - *a0 = vaddq_s16(d0, d1); - *a1 = vsubq_s16(d0, d1); -} - -static INLINE void idct8x8_12_pass1_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, - int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, - int16x4_t *const io6, int16x4_t *const io7) { +static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) { + const int16x4_t cospis = vld1_s16(kCospi); + int16x4_t b[4]; + int32x4_t c[4]; + int16x8_t d[2]; + + b[0] = vget_low_s16(a[0]); + b[1] = vget_high_s16(a[0]); + b[2] = vget_low_s16(a[1]); + b[3] = vget_high_s16(a[1]); + c[0] = vmull_lane_s16(b[0], cospis, 2); + c[2] = vmull_lane_s16(b[1], cospis, 2); + c[1] = vsubq_s32(c[0], c[2]); + c[0] = vaddq_s32(c[0], c[2]); + c[3] = vmull_lane_s16(b[2], cospis, 3); + c[2] = vmull_lane_s16(b[2], cospis, 1); + c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1); + c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3); + dct_const_round_shift_low_8_dual(c, &d[0], &d[1]); + a[0] = vaddq_s16(d[0], d[1]); + a[1] = vsubq_s16(d[0], d[1]); +} + +static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) { + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); +} + +static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + int16x4_t *const io) { int16x4_t step1[8], step2[8]; int32x4_t t32[2]; - transpose_s16_4x4d(io0, io1, io2, io3); + transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]); // stage 1 - step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); - step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); - step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); - step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0); // stage 2 - step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); - step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); - step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1); step2[4] = vadd_s16(step1[4], step1[5]); step2[5] = vsub_s16(step1[4], step1[5]); @@ -352,32 +354,27 @@ static INLINE void idct8x8_12_pass1_bd8( step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); // stage 4 - *io0 = vadd_s16(step1[0], step2[7]); - *io1 = vadd_s16(step1[1], step1[6]); - *io2 = vadd_s16(step1[2], step1[5]); - *io3 = vadd_s16(step1[3], step2[4]); - *io4 = vsub_s16(step1[3], step2[4]); - *io5 = vsub_s16(step1[2], step1[5]); - *io6 = vsub_s16(step1[1], step1[6]); - *io7 = vsub_s16(step1[0], step2[7]); -} - -static INLINE void idct8x8_12_pass2_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - const int16x4_t input0, const int16x4_t input1, const 
int16x4_t input2, - const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, - const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, - int16x8_t *const output1, int16x8_t *const output2, - int16x8_t *const output3, int16x8_t *const output4, - int16x8_t *const output5, int16x8_t *const output6, - int16x8_t *const output7) { + io[0] = vadd_s16(step1[0], step2[7]); + io[1] = vadd_s16(step1[1], step1[6]); + io[2] = vadd_s16(step1[2], step1[5]); + io[3] = vadd_s16(step1[3], step2[4]); + io[4] = vsub_s16(step1[3], step2[4]); + io[5] = vsub_s16(step1[2], step1[5]); + io[6] = vsub_s16(step1[1], step1[6]); + io[7] = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + const int16x4_t *const input, + int16x8_t *const output) { int16x8_t in[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, - input7, &in[0], &in[1], &in[2], &in[3]); + transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5], + input[6], input[7], &in[0], &in[1], &in[2], &in[3]); // stage 1 step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); @@ -407,86 +404,64 @@ static INLINE void idct8x8_12_pass2_bd8( t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *output0 = vaddq_s16(step1[0], step2[7]); - *output1 = vaddq_s16(step1[1], step1[6]); - *output2 = vaddq_s16(step1[2], step1[5]); - *output3 = vaddq_s16(step1[3], step2[4]); - *output4 = vsubq_s16(step1[3], step2[4]); - *output5 = vsubq_s16(step1[2], step1[5]); - *output6 = vsubq_s16(step1[1], step1[6]); - *output7 = vsubq_s16(step1[0], step2[7]); -} - -static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, - const int16x4_t cospis1, - int16x8_t *const io0, int16x8_t *const io1, - int16x8_t *const io2, int16x8_t *const io3, - int16x8_t *const io4, int16x8_t *const io5, - int16x8_t *const io6, - int16x8_t *const io7) { - int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; + output[0] = vaddq_s16(step1[0], step2[7]); + output[1] = vaddq_s16(step1[1], step1[6]); + output[2] = vaddq_s16(step1[2], step1[5]); + output[3] = vaddq_s16(step1[3], step2[4]); + output[4] = vsubq_s16(step1[3], step2[4]); + output[5] = vsubq_s16(step1[2], step1[5]); + output[6] = vsubq_s16(step1[1], step1[6]); + output[7] = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int16x4_t step1l[4], step1h[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - - transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s16(*io1); - input_1h = vget_high_s16(*io1); - input_3l = vget_low_s16(*io3); - input_3h = vget_high_s16(*io3); - input_5l = vget_low_s16(*io5); - input_5h = 
vget_high_s16(*io5); - input_7l = vget_low_s16(*io7); - input_7h = vget_high_s16(*io7); - step1l[0] = vget_low_s16(*io0); - step1h[0] = vget_high_s16(*io0); - step1l[1] = vget_low_s16(*io2); - step1h[1] = vget_high_s16(*io2); - step1l[2] = vget_low_s16(*io4); - step1h[2] = vget_high_s16(*io4); - step1l[3] = vget_low_s16(*io6); - step1h[3] = vget_high_s16(*io6); - - t32[0] = vmull_lane_s16(input_1l, cospis1, 3); - t32[1] = vmull_lane_s16(input_1h, cospis1, 3); - t32[2] = vmull_lane_s16(input_3l, cospis1, 2); - t32[3] = vmull_lane_s16(input_3h, cospis1, 2); - t32[4] = vmull_lane_s16(input_3l, cospis1, 1); - t32[5] = vmull_lane_s16(input_3h, cospis1, 1); - t32[6] = vmull_lane_s16(input_1l, cospis1, 0); - t32[7] = vmull_lane_s16(input_1h, cospis1, 0); - t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); - t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); - t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); - t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); - t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); - t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); - t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); - t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step1[4] = vcombine_s16(t16[0], t16[1]); - step1[5] = vcombine_s16(t16[2], t16[3]); - step1[6] = vcombine_s16(t16[4], t16[5]); - step1[7] = vcombine_s16(t16[6], t16[7]); + input1l = vget_low_s16(io[1]); + input1h = vget_high_s16(io[1]); + input3l = vget_low_s16(io[3]); + input3h = vget_high_s16(io[3]); + input5l = vget_low_s16(io[5]); + input5h = vget_high_s16(io[5]); + input7l = vget_low_s16(io[7]); + input7h = vget_high_s16(io[7]); + step1l[0] = vget_low_s16(io[0]); + step1h[0] = vget_high_s16(io[0]); + step1l[1] = vget_low_s16(io[2]); + step1h[1] = vget_high_s16(io[2]); + step1l[2] = vget_low_s16(io[4]); + step1h[2] = vget_high_s16(io[4]); + step1l[3] = vget_low_s16(io[6]); + step1h[3] = vget_high_s16(io[6]); + + t32[0] = vmull_lane_s16(input1l, cospis1, 3); + t32[1] = vmull_lane_s16(input1h, cospis1, 3); + t32[2] = vmull_lane_s16(input3l, cospis1, 2); + t32[3] = vmull_lane_s16(input3h, cospis1, 2); + t32[4] = vmull_lane_s16(input3l, cospis1, 1); + t32[5] = vmull_lane_s16(input3h, cospis1, 1); + t32[6] = vmull_lane_s16(input1l, cospis1, 0); + t32[7] = vmull_lane_s16(input1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]); + dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]); // stage 2 t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); @@ -503,18 +478,8 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); 
t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step2[0] = vcombine_s16(t16[0], t16[1]); - step2[1] = vcombine_s16(t16[2], t16[3]); - step2[2] = vcombine_s16(t16[4], t16[5]); - step2[3] = vcombine_s16(t16[6], t16[7]); + dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]); + dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]); step2[4] = vaddq_s16(step1[4], step1[5]); step2[5] = vsubq_s16(step1[4], step1[5]); @@ -533,35 +498,25 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *io0 = vaddq_s16(step1[0], step2[7]); - *io1 = vaddq_s16(step1[1], step1[6]); - *io2 = vaddq_s16(step1[2], step1[5]); - *io3 = vaddq_s16(step1[3], step2[4]); - *io4 = vsubq_s16(step1[3], step2[4]); - *io5 = vsubq_s16(step1[2], step1[5]); - *io6 = vsubq_s16(step1[1], step1[6]); - *io7 = vsubq_s16(step1[0], step2[7]); + io[0] = vaddq_s16(step1[0], step2[7]); + io[1] = vaddq_s16(step1[1], step1[6]); + io[2] = vaddq_s16(step1[2], step1[5]); + io[3] = vaddq_s16(step1[3], step2[4]); + io[4] = vsubq_s16(step1[3], step2[4]); + io[5] = vsubq_s16(step1[2], step1[5]); + io[6] = vsubq_s16(step1[1], step1[6]); + io[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, - int16x8_t *const d0, - int16x8_t *const d1) { - int16x4_t t16[4]; - - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - *d0 = vcombine_s16(t16[0], t16[1]); - *d1 = vcombine_s16(t16[2], t16[3]); +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); } static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, @@ -584,7 +539,7 @@ static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, int32x4_t t32[4]; idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, @@ -596,7 +551,7 @@ static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); t32[2] = vnegq_s32(t32[2]); t32[3] = vnegq_s32(t32[3]); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void 
idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, @@ -611,7 +566,7 @@ static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, @@ -627,7 +582,7 @@ static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, @@ -643,7 +598,7 @@ static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, @@ -659,7 +614,7 @@ static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, @@ -675,7 +630,7 @@ static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, @@ -691,7 +646,7 @@ static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, @@ -707,7 +662,7 @@ static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, @@ -786,73 +741,94 @@ static INLINE void idct16x16_store_pass1(const int16x8_t *const out, 
vst1q_s16(output, out[15]); } -static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, - const int stride) { - uint8x8_t d = vld1_u8(*dest); - uint16x8_t q; - - res = vrshrq_n_s16(res, 6); - q = vaddw_u8(vreinterpretq_u16_s16(res), d); - d = vqmovun_s16(vreinterpretq_s16_u16(q)); +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); vst1_u8(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, - uint16_t **dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} - res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); - res = vminq_s16(res, max); - d = vqshluq_n_s16(res, 0); +static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 6); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void highbd_idct16x16_add8x1(const int16x8_t a, + const int16x8_t max, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s)); + const int16x8_t res1 = vminq_s16(res0, max); + const uint16x8_t d = vqshluq_n_s16(res1, 0); vst1q_u16(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest, +static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a, + uint16_t **const dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); - - res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6); - d = vmovl_u8(vqmovun_s16(res)); + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6); + const uint16x8_t d = vmovl_u8(vqmovun_s16(res)); vst1q_u16(*dest, d); *dest += stride; } static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, - uint16_t *out, const int b_stride) { - highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride); - 
highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride); + uint16_t *out, const int stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, stride); + highbd_idct16x16_add8x1_bd8(a[1], &out, stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, stride); + highbd_idct16x16_add8x1_bd8(a[23], &out, stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, stride); } static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index fb1fa6b68..38e275834 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -667,8 +667,6 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } -// ----------------------------------------------------------------------------- - #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 4efad5333..ea0962954 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -19,6 +19,13 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + return vcreate_s16((uint16_t)c0 | ((uint16_t)c1 << 16) | + ((int64_t)(uint16_t)c2 
<< 32) | + ((int64_t)(uint16_t)c3 << 48)); +} + // Helper functions used to load tran_low_t into int16, narrowing if necessary. static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h index 6ee2a5863..16272ae3a 100644 --- a/vpx_dsp/bitreader.h +++ b/vpx_dsp/bitreader.h @@ -94,7 +94,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { } { - register int shift = vpx_norm[range]; + const int shift = vpx_norm[range]; range <<= shift; value <<= shift; count -= shift; diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index 41040cf93..e63092a1a 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -35,7 +35,7 @@ static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; split = 1 + (((range - 1) * probability) >> 8); diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h index 13137659f..e86fd3996 100644 --- a/vpx_dsp/inv_txfm.h +++ b/vpx_dsp/inv_txfm.h @@ -76,7 +76,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits - #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c index aafa272fb..9ef04836a 100644 --- a/vpx_dsp/mips/deblock_msa.c +++ b/vpx_dsp/mips/deblock_msa.c @@ -14,38 +14,37 @@ extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, \ - out15) \ - { \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ +#define VPX_TRANSPOSE8x16_UB_UB( \ + in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, 
\ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ } #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index d43a9fd18..f095cb0a4 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -109,6 +109,7 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; int32x4_t temp1, temp2, temp3, temp4; int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, @@ -152,8 +153,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, output_v = vec_packsu(tmp16_0, tmp16_1); vec_vsx_st(output_v, 0, tmp_dest); - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3b1a873cd..a4a6fa084 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -242,6 +242,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c diff --git a/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h index f9abaecf2..fd28d0d55 100644 --- a/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -778,6 +778,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { return; } #endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us // into 32 bits. { @@ -834,6 +835,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { return; } #endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us // into 32 bits. 
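// (The interleaved halves feed _mm_madd_epi16, which multiplies the 16-bit
// coefficients by paired cosine constants and sums adjacent products into
// 32-bit lanes ahead of the DCT_CONST_BITS rounding shift.)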
{ diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 7e75d5d10..ef94522a3 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -192,8 +192,6 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, // ----------------------------------------------------------------------------- // Horizontal and vertical filtering -#define CONV8_ROUNDING_BITS (7) - static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; @@ -210,6 +208,8 @@ static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; +#define CONV8_ROUNDING_BITS (7) + // ----------------------------------------------------------------------------- // Horizontal Filtering diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index de097c66a..7898ee12c 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[15] = in[15]; } -static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; // stage 2 @@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, in = all[i]; highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); - highbd_idct16_4col(in); + vpx_highbd_idct16_4col_sse4_1(in); input += 4 * 16; } @@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, transpose_32bit_4x4(all[1] + i, out + 4); transpose_32bit_4x4(all[2] + i, out + 8); transpose_32bit_4x4(all[3] + i, out + 12); - highbd_idct16_4col(out); + vpx_highbd_idct16_4col_sse4_1(out); for (j = 0; j < 16; ++j) { highbd_write_buffer_4(dest + j * stride, out[j], bd); diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 909a6b794..bb7a510e1 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index ae391b2c0..8b2e3d241 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -17,7 +17,7 @@ #include "vpx_dsp/x86/inv_txfm_ssse3.h" #include "vpx_dsp/x86/transpose_sse2.h" -static void highbd_idct8x8_half1d(__m128i *const io) { +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = 
_mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); @@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); temp[0] = io[4]; temp[1] = io[5]; @@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[5] = io[9]; io[6] = io[10]; io[7] = io[11]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = temp[0]; io[9] = temp[1]; io[10] = temp[2]; io[11] = temp[3]; - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); highbd_idct8x8_final_round(io); } diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 435934f1b..5a7fd1d39 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -106,4 +106,7 @@ static INLINE void highbd_idct4_sse4_1(__m128i *const io) { io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] } +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + #endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index d9a6932e0..e1f9657df 100644 --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -91,7 +91,7 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ @@ -99,19 +99,20 @@ SECTION .text sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -130,8 +131,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height heightd ; Store bilin_filter and pw_8 location in stack @@ -150,22 +152,16 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, 
y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + dst, dst_stride, height, sse %define block_height heightd %endif @@ -284,14 +280,14 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -308,7 +304,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -511,14 +507,14 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -535,7 +531,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -633,14 +629,14 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -657,7 +653,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -732,14 +728,14 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -756,7 +752,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -859,8 +855,8 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift @@ -869,7 +865,7 @@ SECTION .text mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, 
[GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -897,7 +893,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 6b1837df5..4b02da966 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -165,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { - idct8_sse2(in); + vpx_idct8_sse2(in); } write_buffer_8x8(in, dest, stride); @@ -221,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *const in) { +void vpx_idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() transpose_16bit_8x8(in, in); @@ -514,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, } } -static void iadst16_8col(__m128i *const in) { +void vpx_iadst16_8col_sse2(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -874,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) { void iadst16_sse2(__m128i *const in0, __m128i *const in1) { transpose_16bit_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); } // Group the coefficient calculation into smaller functions to prevent stack diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 5cd5098f1..d573f66c9 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -697,10 +697,11 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( } void idct4_sse2(__m128i *const in); -void idct8_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); void idct16_sse2(__m128i *const in0, __m128i *const in1); void iadst4_sse2(__m128i *const in); void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); void iadst16_sse2(__m128i *const in0, __m128i *const in1); void idct32_1024_8x32(const __m128i *const in, __m128i *const out); void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); diff --git a/vpx_dsp/x86/quantize_x86.h b/vpx_dsp/x86/quantize_x86.h index 34928fbb5..0e07a2ac5 100644 --- a/vpx_dsp/x86/quantize_x86.h +++ b/vpx_dsp/x86/quantize_x86.h @@ -12,7 +12,6 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm index cee4468c1..d938c1da4 100644 --- a/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/vpx_dsp/x86/subpel_variance_sse2.asm @@ -114,27 +114,26 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. 
May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -152,9 +151,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse, g_bilin_filter, g_pw_8 %define block_height heightd ;Store bilin_filter and pw_8 location in stack @@ -173,25 +172,18 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -371,8 +363,8 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -380,7 +372,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -397,7 +389,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -694,8 +686,8 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -703,7 +695,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] 
%endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -720,7 +712,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -852,8 +844,8 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -861,7 +853,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -878,7 +870,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -994,8 +986,8 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -1003,7 +995,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -1020,7 +1012,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1192,8 +1184,8 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift @@ -1206,7 +1198,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -1234,7 +1226,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h index 343f27577..b0608591f 100644 --- a/vpx_ports/mem_ops.h +++ b/vpx_ports/mem_ops.h @@ -224,5 +224,4 @@ static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { mem[3] = (MAU_T)((val >> 24) & 0xff); } /* clang-format on */ - #endif // VPX_PORTS_MEM_OPS_H_ diff --git a/vpx_util/vpx_atomics.h b/vpx_util/vpx_atomics.h index b8cf80dae..504691065 100644 --- a/vpx_util/vpx_atomics.h +++ b/vpx_util/vpx_atomics.h @@ -68,7 +68,9 @@ extern "C" { // on any platform (to discourage programmer errors by setting values directly). // This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT // (NOT memset) and accessed through vpx_atomic_ functions. 
-typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; +typedef struct vpx_atomic_int { + volatile int value; +} vpx_atomic_int; #define VPX_ATOMIC_INIT(num) \ { num } diff --git a/vpxenc.c b/vpxenc.c --- a/vpxenc.c +++ b/vpxenc.c @@ -2215,9 +2215,9 @@ int main(int argc, const char **argv_) { if (!global.quiet) { FOREACH_STREAM(fprintf( - stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 - "b/f %7" PRId64 "b/s" - " %7" PRId64 " %s (%.2f fps)\033[K\n", + stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 + "b/s %7" PRId64 " %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, diff --git a/y4minput.c b/y4minput.c index e2df7a299..bf349c059 100644 --- a/y4minput.c +++ b/y4minput.c @@ -139,7 +139,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { Conversions which require both horizontal and vertical filtering could have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ - #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
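
For reference, the WRAPLOW idiom touched in the vpx_dsp/inv_txfm.h hunk above pairs a 16-bit left shift with an arithmetic right shift, which keeps the low 16 bits of a 32-bit transform intermediate and sign-extends bit 15. The standalone sketch below is not part of the patch: wrap_low() and its test values are illustrative only, and the in-tree macro additionally routes its argument through check_range(). Per the comment in that hunk, the HIGHBD_WRAPLOW variant instead shifts by 24 - bd so a bd-bit build keeps bd + 8 bits rather than 16.

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-in for WRAPLOW(x): wrap a 32-bit intermediate into the
 * int16_t range by keeping the low 16 bits and sign-extending bit 15. The
 * left shift here is done on an unsigned type to sidestep signed-overflow
 * undefined behavior; the in-tree macro shifts the signed value directly. */
static int32_t wrap_low(int32_t x) {
  return (int32_t)((uint32_t)x << 16) >> 16;
}

int main(void) {
  assert(wrap_low(32767) == 32767);   /* already fits in 16 bits */
  assert(wrap_low(32768) == -32768);  /* wraps exactly like int16_t overflow */
  assert(wrap_low(-32769) == 32767);
  assert(wrap_low(40000) == 40000 - 65536);
  return 0;
}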