35 files changed, 2530 insertions(+), 1060 deletions(-)
diff --git a/.gitignore b/.gitignore
index 4337a2c32..bf5ffc713 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,8 +39,7 @@
 /examples/vp8cx_set_ref
 /examples/vp9cx_set_ref
 /examples/vp9_lossless_encoder
-/examples/vp9_spatial_scalable_encoder
-/examples/vpx_temporal_scalable_patterns
+/examples/vp9_spatial_svc_encoder
 /examples/vpx_temporal_svc_encoder
 /ivfdec
 /ivfdec.dox
@@ -51,6 +50,9 @@
 /samples.dox
 /test_intra_pred_speed
 /test_libvpx
+/tools.dox
+/tools/*.dox
+/tools/tiny_ssim
 /vp8_api1_migration.dox
 /vp[89x]_rtcd.h
 /vpx.pc
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 36120170e..09bdc5d2f 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -71,7 +71,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   include $(CONFIG_DIR)libs-armv7-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-  include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+  include $(CONFIG_DIR)libs-arm64-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),x86)
   include $(CONFIG_DIR)libs-x86-android-gcc.mk
@@ -101,8 +101,8 @@ LOCAL_CFLAGS := -O3
 # like x86inc.asm and x86_abi_support.asm
 LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)

-.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
+.PRECIOUS: %.asm.S
+$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
 	@mkdir -p $(dir $@)
 	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@

@@ -132,7 +132,7 @@ endif

 # Pull out assembly files, splitting NEON from the rest.  This is
 # done to specify that the NEON assembly files use NEON assembler flags.
-# x86 assembly matches %.asm, arm matches %.asm.s
+# x86 assembly matches %.asm, arm matches %.asm.S

 # x86:
@@ -140,12 +140,12 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
 LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))

 # arm:
-CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE))
 CODEC_SRCS_ASM_ARM = $(foreach v, \
                      $(CODEC_SRCS_ASM_ARM_ALL), \
                      $(if $(findstring neon,$(v)),,$(v)))
-CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
-                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
+                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                          $(CODEC_SRCS_ASM_ARM))
 LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)

@@ -153,18 +153,19 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   CODEC_SRCS_ASM_NEON = $(foreach v, \
                         $(CODEC_SRCS_ASM_ARM_ALL),\
                         $(if $(findstring neon,$(v)),$(v),))
-  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
-                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
+                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                                 $(CODEC_SRCS_ASM_NEON))
-  LOCAL_SRC_FILES += $(patsubst %.s, \
-                     %.s.neon, \
+  LOCAL_SRC_FILES += $(patsubst %.S, \
+                     %.S.neon, \
                      $(CODEC_SRCS_ASM_NEON_ADS2GAS))
 endif

 LOCAL_CFLAGS += \
     -DHAVE_CONFIG_H=vpx_config.h \
     -I$(LIBVPX_PATH) \
-    -I$(ASM_CNV_PATH)
+    -I$(ASM_CNV_PATH) \
+    -I$(ASM_CNV_PATH)/libvpx

 LOCAL_MODULE := libvpx

@@ -185,7 +186,8 @@ endif
 $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h

-ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
+ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
 $$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
 endef
diff --git a/build/make/Makefile b/build/make/Makefile
index 469eb74c3..cba605786 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -90,7 +90,7 @@ all:

 .PHONY: clean
 clean::
-	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
+	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S)
 	rm -f $(CLEAN-OBJS)

 .PHONY: clean
@@ -180,13 +180,13 @@ $(BUILD_PFX)%.asm.o: %.asm
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<

-$(BUILD_PFX)%.s.d: %.s
+$(BUILD_PFX)%.S.d: %.S
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
 	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
             --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@

-$(BUILD_PFX)%.s.o: %.s
+$(BUILD_PFX)%.S.o: %.S
 	$(if $(quiet),@echo "    [AS] $@")
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
@@ -198,8 +198,8 @@ $(BUILD_PFX)%.c.S: %.c
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<

-.PRECIOUS: %.asm.s
-$(BUILD_PFX)%.asm.s: %.asm
+.PRECIOUS: %.asm.S
+$(BUILD_PFX)%.asm.S: %.asm
 	$(if $(quiet),@echo "    [ASM CONVERSION] $@")
 	$(qexec)mkdir -p $(dir $@)
 	$(qexec)$(ASM_CONVERSION) <$< >$@
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 35609e89a..f050fa06a 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -635,7 +635,7 @@ setup_gnu_toolchain() {
   AS=${AS:-${CROSS}as}
   STRIP=${STRIP:-${CROSS}strip}
   NM=${NM:-${CROSS}nm}
-  AS_SFX=.s
+  AS_SFX=.S
   EXE_SFX=
 }

@@ -926,7 +926,7 @@ EOF
         ;;
       vs*)
         asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
-        AS_SFX=.s
+        AS_SFX=.S
         msvs_arch_dir=arm-msvs
         disable_feature multithread
         disable_feature unit_tests
@@ -1034,7 +1034,7 @@ EOF
       STRIP="$(${XCRUN_FIND} strip)"
      NM="$(${XCRUN_FIND} nm)"
       RANLIB="$(${XCRUN_FIND} ranlib)"
-      AS_SFX=.s
+      AS_SFX=.S
       LD="${CXX:-$(${XCRUN_FIND} ld)}"

       # ASFLAGS is written here instead of using check_add_asflags
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index e98611d10..e3395afa2 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -203,7 +203,7 @@ for opt in "$@"; do
             # The paths in file_list are fixed outside of the loop.
             file_list[${#file_list[@]}]="$opt"
             case "$opt" in
-                *.asm|*.s) uses_asm=true
+                *.asm|*.[Ss]) uses_asm=true
                 ;;
             esac
             ;;
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -22,6 +22,7 @@ show_help(){
 Advanced options:
   ${toggle_libs}                  libraries
   ${toggle_examples}              examples
+  ${toggle_tools}                 tools
   ${toggle_docs}                  documentation
   ${toggle_unit_tests}            unit tests
   ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
@@ -155,7 +156,7 @@ all_platforms="${all_platforms} generic-gnu"

 # all_targets is a list of all targets that can be configured
 # note that these should be in dependency order for now.
-all_targets="libs examples docs"
+all_targets="libs examples tools docs"

 # all targets available are enabled, by default.
 for t in ${all_targets}; do
@@ -331,6 +332,7 @@ CMDLINE_SELECT="
     libs
     examples
+    tools
     docs
     libc
     as
@@ -476,7 +478,7 @@ EOF
     #
     # Write makefiles for all enabled targets
     #
-    for tgt in libs examples docs solution; do
+    for tgt in libs examples tools docs solution; do
         tgt_fn="$tgt-$toolchain.mk"

         if enabled $tgt; then
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index cecdce080..fa2df7271 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -84,6 +84,8 @@ static const arg_def_t speed_arg =
     ARG_DEF("sp", "speed", 1, "speed configuration");
 static const arg_def_t aqmode_arg =
     ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
+static const arg_def_t bitrates_arg =
+    ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]");

 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = {
   &frames_arg,
 #endif
   &speed_arg,
   &rc_end_usage_arg,
+  &bitrates_arg,
   NULL
 };
@@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_,
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
       snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
                string_options, arg.val);
+    } else if (arg_match(&arg, &bitrates_arg, argi)) {
+      snprintf(string_options, sizeof(string_options), "%s bitrates=%s",
+               string_options, arg.val);
     } else if (arg_match(&arg, &passes_arg, argi)) {
       passes = arg_parse_uint(&arg);
       if (passes < 1 || passes > 2) {
@@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc,
   for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
-      const int tlayer0 = sl * cfg->ts_number_layers;
       if (cfg->ts_number_layers == 1)
         rc->layer_framerate[layer] = framerate;
       else
@@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc,
                 cfg->layer_target_bitrate[layer - 1]) /
             (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
       } else {
-        rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] /
-                                 rc->layer_framerate[tlayer0];
+        rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
+                               rc->layer_framerate[layer];
       }
       rc->layer_input_frames[layer] = 0;
       rc->layer_enc_frames[layer] = 0;
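For context on the `layer_pfb` fix in the hunk above: layers are flattened as `sl * ts_number_layers + tl`, and each layer's per-frame bandwidth should come from that layer's own target bitrate and framerate rather than the base temporal layer's. A minimal C sketch of the corrected arithmetic; the standalone helper is hypothetical, only the field names mirror the example's `cfg`/`rc` data:

```c
#include <stdio.h>

/* Hypothetical helper mirroring the corrected hunk: per-frame budget in
 * bits for layer (sl, tl), from a kbps target and a layer framerate. */
static double layer_per_frame_bits(const int *layer_target_bitrate_kbps,
                                   const double *layer_framerate,
                                   int ts_number_layers, int sl, int tl) {
  const int layer = sl * ts_number_layers + tl; /* flattened layer index */
  return 1000.0 * layer_target_bitrate_kbps[layer] / layer_framerate[layer];
}

int main(void) {
  const int bitrate_kbps[2] = { 200, 400 };   /* sl0/tl0, sl0/tl1 */
  const double framerate[2] = { 15.0, 30.0 }; /* temporal layers halve rate */
  /* 200 kbps at 15 fps -> ~13333 bits per frame for the base layer. */
  printf("%.0f\n", layer_per_frame_bits(bitrate_kbps, framerate, 2, 0, 0));
  return 0;
}
```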
@@ -449,12 +454,13 @@ static void printout_rate_control_summary(
     struct RateControlStats *rc, vpx_codec_enc_cfg_t *cfg, int frame_cnt) {
   unsigned int sl, tl;
-  int tot_num_frames = 0;
   double perc_fluctuation = 0.0;
+  int tot_num_frames = 0;
   printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
   printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
          cfg->ss_number_layers, cfg->ts_number_layers);
   for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    tot_num_frames = 0;
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
       const int num_dropped =
@@ -462,7 +468,7 @@ static void printout_rate_control_summary(
               ? (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer])
               : (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] -
                  1);
-      if (!sl) tot_num_frames += rc->layer_input_frames[layer];
+      tot_num_frames += rc->layer_input_frames[layer];
       rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
                                           rc->layer_encoding_bitrate[layer] /
                                           tot_num_frames;
@@ -620,7 +626,7 @@ int main(int argc, const char **argv) {
   struct RateControlStats rc;
   vpx_svc_layer_id_t layer_id;
   vpx_svc_ref_frame_config_t ref_frame_config;
-  int sl, tl;
+  unsigned int sl, tl;
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
@@ -695,6 +701,8 @@ int main(int argc, const char **argv) {
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
   if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+  if (svc_ctx.speed >= 5)
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);

   // Encode frames
   while (!end_of_stream) {
@@ -730,7 +738,7 @@ int main(int argc, const char **argv) {
                             &ref_frame_config);
     // Keep track of input frames, to account for frame drops in rate control
     // stats/metrics.
-    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+    for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) {
       ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
                               layer_id.temporal_layer_id];
     }
@@ -793,7 +801,7 @@ int main(int argc, const char **argv) {
                 rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
                 // Keep count of rate control stats per layer, for non-key
                 // frames.
-                if (tl == layer_id.temporal_layer_id &&
+                if (tl == (unsigned int)layer_id.temporal_layer_id &&
                     !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
                   rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
                   rc.layer_avg_rate_mismatch[layer] +=
@@ -807,7 +815,7 @@ int main(int argc, const char **argv) {
               // Update for short-time encoding bitrate states, for moving
               // window of size rc->window, shifted by rc->window / 2.
               // Ignore first window segment, due to key frame.
-              if (frame_cnt > rc.window_size) {
+              if (frame_cnt > (unsigned int)rc.window_size) {
                 tl = layer_id.temporal_layer_id;
                 for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                   sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
@@ -823,13 +831,14 @@ int main(int argc, const char **argv) {
               }

               // Second shifted window.
-              if (frame_cnt > rc.window_size + rc.window_size / 2) {
+              if (frame_cnt >
+                  (unsigned int)(rc.window_size + rc.window_size / 2)) {
                 tl = layer_id.temporal_layer_id;
                 for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                   sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
                 }

-                if (frame_cnt > 2 * rc.window_size &&
+                if (frame_cnt > (unsigned int)(2 * rc.window_size) &&
                     frame_cnt % rc.window_size == 0) {
                   rc.window_count += 1;
                   rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
@@ -842,10 +851,11 @@ int main(int argc, const char **argv) {
             }
 #endif
           }
-
+          /*
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          */
           if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
             si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
           ++frames_received;
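The short-time bitrate bookkeeping above averages frame sizes over a moving window of `window_size` frames, with a second window shifted by half that size so the two estimates interleave; the first window is skipped because the keyframe dominates it. A minimal, self-contained C sketch of the same accumulation under the simplifying assumptions of a single layer, a fixed framerate, and a constant stand-in frame size:

```c
#include <stdio.h>

#define WINDOW_SIZE 30
#define FRAMERATE 30.0

int main(void) {
  double sum_kbits = 0.0;       /* accumulated kbps samples in this window */
  double avg_window_kbps = 0.0; /* running sum of completed window means */
  int window_count = 0;

  for (int frame = 1; frame <= 300; ++frame) {
    const int frame_bytes = 1000; /* stand-in for cx_pkt->data.frame.sz */
    /* Skip the first window: it is dominated by the keyframe. */
    if (frame > WINDOW_SIZE) {
      sum_kbits += 0.001 * 8.0 * frame_bytes * FRAMERATE;
      if (frame % WINDOW_SIZE == 0) { /* window complete: fold into mean */
        ++window_count;
        avg_window_kbps += sum_kbits / WINDOW_SIZE;
        sum_kbits = 0.0;
      }
    }
  }
  printf("avg short-time bitrate: %.1f kbps\n",
         avg_window_kbps / window_count);
  return 0;
}
```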
diff --git a/libs.mk b/libs.mk
--- a/libs.mk
+++ b/libs.mk
@@ -12,7 +12,7 @@
 # ARM assembly files are written in RVCT-style. We use some make magic to
 # filter those files to allow GCC compilation
 ifeq ($(ARCH_ARM),yes)
-  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm)
 else
   ASM:=.asm
 endif
@@ -366,7 +366,7 @@ endif
 #
 # Add assembler dependencies for configuration.
 #
-$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
+$(filter %.S.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
 $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm

diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 12eaa80e7..2921e5ddf 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -188,6 +188,7 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
     "invalid-vp90-2-08-tile_1x8_frame_parallel.webm.ivf.s288_r01-05_b6-.ivf" },
   { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" },
   { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" },
+  { 2, "invalid-crbug-629481.webm" },
 };

 INSTANTIATE_TEST_CASE_P(
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 9eb4d9dbb..0c704c5c8 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -41,13 +41,42 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
     partial_itxfm_ = GET_PARAM(2);
     tx_size_ = GET_PARAM(3);
     last_nonzero_ = GET_PARAM(4);
+
+    switch (tx_size_) {
+      case TX_4X4: size_ = 4; break;
+      case TX_8X8: size_ = 8; break;
+      case TX_16X16: size_ = 16; break;
+      case TX_32X32: size_ = 32; break;
+      default: FAIL() << "Wrong Size!"; break;
+    }
+    block_size_ = size_ * size_;
+
+    input_block_ = reinterpret_cast<tran_low_t *>(
+        vpx_memalign(16, sizeof(*input_block_) * block_size_));
+    output_block_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, sizeof(*output_block_) * block_size_));
+    output_block_ref_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, sizeof(*output_block_ref_) * block_size_));
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  virtual void TearDown() {
+    vpx_free(input_block_);
+    input_block_ = NULL;
+    vpx_free(output_block_);
+    output_block_ = NULL;
+    vpx_free(output_block_ref_);
+    output_block_ref_ = NULL;
+    libvpx_test::ClearSystemState();
+  }

  protected:
   int last_nonzero_;
   TX_SIZE tx_size_;
+  tran_low_t *input_block_;
+  uint8_t *output_block_;
+  uint8_t *output_block_ref_;
+  int size_;
+  int block_size_;
   FwdTxfmFunc ftxfm_;
   InvTxfmFunc full_itxfm_;
   InvTxfmFunc partial_itxfm_;
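The fixture above moves the test buffers from stack `DECLARE_ALIGNED` arrays to per-transform heap allocations, released in `TearDown`. A minimal C sketch of the same aligned allocate/release pattern, using C11 `aligned_alloc` in place of libvpx's `vpx_memalign`/`vpx_free` (the transform calls are elided):

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
  const int size = 32;                /* e.g., TX_32X32 */
  const int block_size = size * size; /* coefficients per block */

  /* aligned_alloc requires the size to be a multiple of the alignment;
   * both requests below satisfy that for 16-byte alignment. */
  int32_t *input = aligned_alloc(16, sizeof(*input) * block_size);
  uint8_t *output = aligned_alloc(16, sizeof(*output) * block_size);
  if (input == NULL || output == NULL) return 1;

  memset(input, 0, sizeof(*input) * block_size);
  memset(output, 0, sizeof(*output) * block_size);

  /* ... run full and partial inverse transforms, compare outputs ... */

  free(input);
  free(output);
  return 0;
}
```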
@@ -55,95 +84,62 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> {
 };

 TEST_P(PartialIDctTest, RunQuantCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4: size = 4; break;
-    case TX_8X8: size = 8; break;
-    case TX_16X16: size = 16; break;
-    case TX_32X32: size = 32; break;
-    default: FAIL() << "Wrong Size!"; break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
   const int count_test_block = 1000;
-  const int block_size = size * size;
   DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
   DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
-  int max_error = 0;

   for (int i = 0; i < count_test_block; ++i) {
     // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+    memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);

     ACMRandom rnd(ACMRandom::DeterministicSeed());

     for (int i = 0; i < count_test_block; ++i) {
       // Initialize a test block with input range [-255, 255].
       if (i == 0) {
-        for (int j = 0; j < block_size; ++j) input_extreme_block[j] = 255;
+        for (int j = 0; j < block_size_; ++j) input_extreme_block[j] = 255;
       } else if (i == 1) {
-        for (int j = 0; j < block_size; ++j) input_extreme_block[j] = -255;
+        for (int j = 0; j < block_size_; ++j) input_extreme_block[j] = -255;
       } else {
-        for (int j = 0; j < block_size; ++j) {
+        for (int j = 0; j < block_size_; ++j) {
           input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
         }
       }

-      ftxfm_(input_extreme_block, output_ref_block, size);
+      ftxfm_(input_extreme_block, output_ref_block, size_);

       // quantization with maximum allowed step sizes
-      test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+      input_block_[0] = (output_ref_block[0] / 1336) * 1336;
       for (int j = 1; j < last_nonzero_; ++j) {
-        test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] =
+        input_block_[vp9_default_scan_orders[tx_size_].scan[j]] =
             (output_ref_block[j] / 1828) * 1828;
       }
     }

-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+    ASM_REGISTER_STATE_CHECK(
+        full_itxfm_(input_block_, output_block_ref_, size_));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(input_block_, output_block_, size_));

-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error) max_error = error;
-    }
+    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                        sizeof(*output_block_) * block_size_))
+        << "Error: partial inverse transform produces different results";
   }
-
-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
 }

 TEST_P(PartialIDctTest, ResultsMatch) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
-  int size;
-  switch (tx_size_) {
-    case TX_4X4: size = 4; break;
-    case TX_8X8: size = 8; break;
-    case TX_16X16: size = 16; break;
-    case TX_32X32: size = 32; break;
-    default: FAIL() << "Wrong Size!"; break;
-  }
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, test_coef_block2[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst1[kMaxNumCoeffs]);
-  DECLARE_ALIGNED(16, uint8_t, dst2[kMaxNumCoeffs]);
   const int count_test_block = 1000;
   const int max_coeff = 32766 / 4;
-  const int block_size = size * size;
-  int max_error = 0;
   for (int i = 0; i < count_test_block; ++i) {
     // clear out destination buffer
-    memset(dst1, 0, sizeof(*dst1) * block_size);
-    memset(dst2, 0, sizeof(*dst2) * block_size);
-    memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
-    memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    memset(output_block_, 0, sizeof(*output_block_) * block_size_);
+    memset(output_block_ref_, 0, sizeof(*output_block_ref_) * block_size_);
     int max_energy_leftover = max_coeff * max_coeff;
     for (int j = 0; j < last_nonzero_; ++j) {
       int16_t coef = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
@@ -153,24 +149,42 @@ TEST_P(PartialIDctTest, ResultsMatch) {
         max_energy_leftover = 0;
         coef = 0;
       }
-      test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
+      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = coef;
     }

-    memcpy(test_coef_block2, test_coef_block1,
-           sizeof(*test_coef_block2) * block_size);
+    ASM_REGISTER_STATE_CHECK(
+        full_itxfm_(input_block_, output_block_ref_, size_));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(input_block_, output_block_, size_));

-    ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
-    ASM_REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block2, dst2, size));
+    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                        sizeof(*output_block_) * block_size_))
+        << "Error: partial inverse transform produces different results";
+  }
+}

-    for (int j = 0; j < block_size; ++j) {
-      const int diff = dst1[j] - dst2[j];
-      const int error = diff * diff;
-      if (max_error < error) max_error = error;
+TEST_P(PartialIDctTest, AddOutputBlock) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 10;
+  for (int i = 0; i < count_test_block; ++i) {
+    memset(input_block_, 0, sizeof(*input_block_) * block_size_);
+    for (int j = 0; j < last_nonzero_; ++j) {
+      input_block_[vp9_default_scan_orders[tx_size_].scan[j]] = 10;
     }
-  }

-  EXPECT_EQ(0, max_error)
-      << "Error: partial inverse transform produces different results";
+    for (int j = 0; j < block_size_; ++j) {
+      output_block_[j] = output_block_ref_[j] = rnd.Rand8();
+    }
+
+    ASM_REGISTER_STATE_CHECK(
+        full_itxfm_(input_block_, output_block_ref_, size_));
+    ASM_REGISTER_STATE_CHECK(
+        partial_itxfm_(input_block_, output_block_, size_));
+
+    ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
+                        sizeof(*output_block_) * block_size_))
+        << "Error: Transform results are not correctly added to output.";
+  }
 }

 using std::tr1::make_tuple;
@@ -214,7 +228,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
                                  &vpx_idct4x4_1_add_neon, TX_4X4, 1)));
 #else   // !CONFIG_VP9_HIGHBITDEPTH
-// 32x32_34_ 32x32_135_ are implemented using the 1024 version.
+// 32x32_135_ is implemented using the 1024 version.
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
     ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
@@ -222,7 +236,7 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_1024_add_neon, TX_32X32, 135),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
-                                 &vpx_idct32x32_1024_add_neon, TX_32X32, 34),
+                                 &vpx_idct32x32_34_add_neon, TX_32X32, 34),
                       make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
                                  &vpx_idct32x32_1_add_neon, TX_32X32, 1),
                       make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,
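`RunQuantCheck` above simulates the encoder's coarsest quantizer by snapping each forward-transform coefficient to a multiple of the largest allowed step (1336 for DC, 1828 for AC, constants copied from the test). A minimal C sketch of that snapping:

```c
#include <stdio.h>

/* Integer division followed by multiplication snaps a coefficient to the
 * nearest-toward-zero multiple of the quantizer step, as in RunQuantCheck. */
static int quantize_coeff(int coeff, int step) {
  return (coeff / step) * step;
}

int main(void) {
  printf("%d\n", quantize_coeff(3000, 1336));  /* DC step: 3000 -> 2672 */
  printf("%d\n", quantize_coeff(-2000, 1828)); /* AC step: -2000 -> -1828 */
  return 0;
}
```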
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index 2e34fed06..4f6592647 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -7,22 +7,23 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"

-typedef void (*PostProcFunc)(unsigned char *src_ptr, unsigned char *dst_ptr,
-                             int src_pixels_per_line, int dst_pixels_per_line,
-                             int cols, unsigned char *flimit, int size);
+typedef void (*VpxPostProcDownAndAcrossMbRowFunc)(
+    unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line,
+    int dst_pixels_per_line, int cols, unsigned char *flimit, int size);

 namespace {

-class VPxPostProcessingFilterTest
-    : public ::testing::TestWithParam<PostProcFunc> {
+class VpxPostProcDownAndAcrossMbRowTest
+    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 };
@@ -30,7 +31,7 @@ class VPxPostProcessingFilterTest
 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.

-TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
   const int block_width = 16;
   const int block_height = 16;
@@ -78,14 +79,14 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
                                        input_stride, output_stride,
                                        block_width, flimits, 16));

-  static const uint8_t expected_data[block_height] = { 4, 3, 1, 1, 1, 1, 1, 1,
-                                                       1, 1, 1, 1, 1, 1, 3, 4 };
+  static const uint8_t kExpectedOutput[block_height] = {
+    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
+  };

   pixel_ptr = dst_image_ptr;
   for (int i = 0; i < block_height; ++i) {
     for (int j = 0; j < block_width; ++j) {
-      EXPECT_EQ(expected_data[i], pixel_ptr[j])
-          << "VPxPostProcessingFilterTest failed with invalid filter output";
+      ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]);
     }
     pixel_ptr += output_stride;
   }
@@ -96,18 +97,18 @@ TEST_P(VPxPostProcessingFilterTest, FilterOutputCheck) {
 };

 INSTANTIATE_TEST_CASE_P(
-    C, VPxPostProcessingFilterTest,
+    C, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_c));

 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VPxPostProcessingFilterTest,
+    SSE2, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2));
 #endif

 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
-    MSA, VPxPostProcessingFilterTest,
+    MSA, VpxPostProcDownAndAcrossMbRowTest,
     ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa));
 #endif
diff --git a/test/test-data.mk b/test/test-data.mk
index 80b802e0a..e528c9182 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -775,6 +775,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.iv
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res

 ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
 # Encode / Decode test
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index b97ae967e..eda46c918 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -840,3 +840,5 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_
 787f04f0483320d536894282f3358a4f8cac1cf9 *invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res
 91d3cefd0deb98f3b0caf3a2d900ec7a7605e53a *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf
 1e472baaf5f6113459f0399a38a5a5e68d17799d *invalid-vp90-2-10-show-existing-frame.webm.ivf.s180315_r01-05_b6-.ivf.res
+70057835bf29d14e66699ce5f022df2551fb6b37 *invalid-crbug-629481.webm
+5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-crbug-629481.webm.res
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index a89630753..d8eb82426 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -29,6 +29,8 @@ namespace {
 typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left);

+const int kBPS = 32;
+const int kTotalPixels = 32 * kBPS;
 const int kNumVp9IntraPredFuncs = 13;
 const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
   "DC_PRED",   "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
@@ -36,107 +38,121 @@ const char *kVp9IntraPredNames[kNumVp9IntraPredFuncs] = {
   "D207_PRED", "D63_PRED",     "TM_PRED"
 };

+template <typename Pixel>
+struct IntraPredTestMem {
+  void Init(int block_size, int bd) {
+    libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
+    Pixel *const above = above_mem + 16;
+    const int mask = (1 << bd) - 1;
+    for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
+    for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
+
+    // some code assumes the top row has been extended:
+    // d45/d63 C-code, for instance, but not the assembly.
+    // TODO(jzern): this style of extension isn't strictly necessary.
+    ASSERT_LE(block_size, kBPS);
+    for (int i = block_size; i < 2 * kBPS; ++i) {
+      above[i] = above[block_size - 1];
+    }
+  }
+
+  DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, left[kBPS]);
+  DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+typedef IntraPredTestMem<uint8_t> Vp9IntraPredTestMem;
+
+void CheckMd5Signature(const char name[], const char *const signatures[],
+                       const void *data, size_t data_size, int elapsed_time,
+                       int idx) {
+  libvpx_test::MD5 md5;
+  md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+  printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, kVp9IntraPredNames[idx],
+         elapsed_time, md5.Get());
+  EXPECT_STREQ(signatures[idx], md5.Get());
+}
+
 void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs,
-                   const char *const pred_func_names[], int num_funcs,
-                   const char *const signatures[], int block_size,
-                   int num_pixels_per_test) {
-  libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed());
-  const int kBPS = 32;
-  const int kTotalPixels = 32 * kBPS;
-  DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
-  DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
-  DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
-  DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
-  uint8_t *const above = above_mem + 16;
-  for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
-  for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
-  for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
-  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
-
-  // some code assumes the top row has been extended:
-  // d45/d63 C-code, for instance, but not the assembly.
-  // TODO(jzern): this style of extension isn't strictly necessary.
-  ASSERT_LE(block_size, kBPS);
-  memset(above + block_size, above[block_size - 1], 2 * kBPS - block_size);
-
-  for (int k = 0; k < num_funcs; ++k) {
+                   const char *const signatures[], int block_size) {
+  const int kNumTests = static_cast<int>(
+      2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs));
+  Vp9IntraPredTestMem intra_pred_test_mem;
+  const uint8_t *const above = intra_pred_test_mem.above_mem + 16;
+
+  intra_pred_test_mem.Init(block_size, 8);
+
+  for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
     if (pred_funcs[k] == NULL) continue;
-    memcpy(src, ref_src, sizeof(src));
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
     vpx_usec_timer timer;
     vpx_usec_timer_start(&timer);
     for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
-      pred_funcs[k](src, kBPS, above, left);
+      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+                    intra_pred_test_mem.left);
     }
     libvpx_test::ClearSystemState();
     vpx_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
-    libvpx_test::MD5 md5;
-    md5.Add(src, sizeof(src));
-    printf("Mode %s[%12s]: %5d ms     MD5: %s\n", name, pred_func_names[k],
-           elapsed_time, md5.Get());
-    EXPECT_STREQ(signatures[k], md5.Get());
+    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
   }
 }

 void TestIntraPred4(VpxPredFunc const *pred_funcs) {
-  static const int kNumVp9IntraFuncs = 13;
-  static const char *const kSignatures[kNumVp9IntraFuncs] = {
-    "4334156168b34ab599d9b5b30f522fe9", "bc4649d5ba47c7ff178d92e475960fb0",
-    "8d316e5933326dcac24e1064794b5d12", "a27270fed024eafd762c95de85f4da51",
-    "c33dff000d4256c2b8f3bf9e9bab14d2", "44d8cddc2ad8f79b8ed3306051722b4f",
-    "eb54839b2bad6699d8946f01ec041cd0", "ecb0d56ae5f677ea45127ce9d5c058e4",
-    "0b7936841f6813da818275944895b574", "9117972ef64f91a58ff73e1731c81db2",
-    "c56d5e8c729e46825f46dd5d3b5d508a", "c0889e2039bcf7bcb5d2f33cdca69adc",
-    "309a618577b27c648f9c5ee45252bc8f",
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "e7ed7353c3383fff942e500e9bfe82fe", "2a4a26fcc6ce005eadc08354d196c8a9",
+    "269d92eff86f315d9c38fe7640d85b15", "ae2960eea9f71ee3dabe08b282ec1773",
+    "6c1abcc44e90148998b51acd11144e9c", "f7bb3186e1ef8a2b326037ff898cad8e",
+    "364c1f3fb2f445f935aec2a70a67eaa4", "141624072a4a56773f68fadbdd07c4a7",
+    "7be49b08687a5f24df3a2c612fca3876", "459bb5d9fd5b238348179c9a22108cd6",
+    "73edb8831bf1bdfce21ae8eaa43b1234", "2e2457f2009c701a355a8b25eb74fcda",
+    "52ae4e8bdbe41494c1f43051d4dd7f0b"
   };
-  TestIntraPred("Intra4", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs,
-                kSignatures, 4, 4 * 4 * kNumVp9IntraFuncs);
+  TestIntraPred("Intra4", pred_funcs, kSignatures, 4);
 }

 void TestIntraPred8(VpxPredFunc const *pred_funcs) {
-  static const int kNumVp9IntraFuncs = 13;
-  static const char *const kSignatures[kNumVp9IntraFuncs] = {
-    "7694ddeeefed887faf9d339d18850928", "7d726b1213591b99f736be6dec65065b",
-    "19c5711281357a485591aaf9c96c0a67", "ba6b66877a089e71cd938e3b8c40caac",
-    "802440c93317e0f8ba93fab02ef74265", "9e09a47a15deb0b9d8372824f9805080",
-    "b7c2d8c662268c0c427da412d7b0311d", "78339c1c60bb1d67d248ab8c4da08b7f",
-    "5c97d70f7d47de1882a6cd86c165c8a9", "8182bf60688b42205acd95e59e967157",
-    "08323400005a297f16d7e57e7fe1eaac", "95f7bfc262329a5849eda66d8f7c68ce",
-    "815b75c8e0d91cc1ae766dc5d3e445a3",
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
"d8bbae5d6547cfc17e4f5f44c8730e88", "373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", "f81dd986eb2b50f750d3a7da716b7e27", + "d500f2c8fc78f46a4c74e4dcf51f14fb", "0e3523f9cab2142dd37fd07ec0760bce", + "79ac4efe907f0a0f1885d43066cfedee", "19ecf2432ac305057de3b6578474eec6", + "4f985b61acc6dd5d2d2585fa89ea2e2d", "f1bb25a9060dd262f405f15a38f5f674", + "209ea00801584829e9a0f7be7d4a74ba" }; - TestIntraPred("Intra8", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 8, 8 * 8 * kNumVp9IntraFuncs); + TestIntraPred("Intra8", pred_funcs, kSignatures, 8); } void TestIntraPred16(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "b40dbb555d5d16a043dc361e6694fe53", "fb08118cee3b6405d64c1fd68be878c6", - "6c190f341475c837cc38c2e566b64875", "db5c34ccbe2c7f595d9b08b0dc2c698c", - "a62cbfd153a1f0b9fed13e62b8408a7a", "143df5b4c89335e281103f610f5052e4", - "d87feb124107cdf2cfb147655aa0bb3c", "7841fae7d4d47b519322e6a03eeed9dc", - "f6ebed3f71cbcf8d6d0516ce87e11093", "3cc480297dbfeed01a1c2d78dd03d0c5", - "b9f69fa6532b372c545397dcb78ef311", "a8fe1c70432f09d0c20c67bdb6432c4d", - "b8a41aa968ec108af447af4217cba91b", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "50971c07ce26977d30298538fffec619", "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", "b0acf2872ee411d7530af6d2625a7084", + "f32aafed4d8d3776ed58bcb6188756d5", "dae208f3dca583529cff49b73f7c4183", + "7af66a2f4c8e0b4908e40f047e60c47c", "125e3ab6ab9bc961f183ec366a7afa88", + "6b90f25b23983c35386b9fd704427622", "f8d6b11d710edc136a7c62c917435f93", + "ed308f18614a362917f411c218aee532" }; - TestIntraPred("Intra16", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 16, 16 * 16 * kNumVp9IntraFuncs); + TestIntraPred("Intra16", pred_funcs, kSignatures, 16); } void TestIntraPred32(VpxPredFunc const *pred_funcs) { - static const int kNumVp9IntraFuncs = 13; - static const char *const kSignatures[kNumVp9IntraFuncs] = { - "558541656d84f9ae7896db655826febe", "b3587a1f9a01495fa38c8cd3c8e2a1bf", - "4c6501e64f25aacc55a2a16c7e8f0255", "b3b01379ba08916ef6b1b35f7d9ad51c", - "0f1eb38b6cbddb3d496199ef9f329071", "911c06efb9ed1c3b4c104b232b55812f", - "9225beb0ddfa7a1d24eaa1be430a6654", "0a6d584a44f8db9aa7ade2e2fdb9fc9e", - "b01c9076525216925f3456f034fb6eee", "d267e20ad9e5cd2915d1a47254d3d149", - "ed012a4a5da71f36c2393023184a0e59", "f162b51ed618d28b936974cff4391da5", - "9e1370c6d42e08d357d9612c93a71cfc", + static const char *const kSignatures[kNumVp9IntraPredFuncs] = { + "a0a618c900e65ae521ccc8af789729f2", "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", "bc2c9da91ad97ef0d1610fb0a9041657", + "75c79b1362ad18abfcdb1aa0aacfc21d", "4039bb7da0f6860090d3c57b5c85468f", + "b29fff7b61804e68383e3a609b33da58", "e1aa5e49067fd8dba66c2eb8d07b7a89", + "4e042822909c1c06d3b10a88281df1eb", "72eb9d9e0e67c93f4c66b70348e9fef7", + "a22d102bcb51ca798aac12ca4ae8f2e8" }; - TestIntraPred("Intra32", pred_funcs, kVp9IntraPredNames, kNumVp9IntraFuncs, - kSignatures, 32, 32 * 32 * kNumVp9IntraFuncs); + TestIntraPred("Intra32", pred_funcs, kSignatures, 32); } } // namespace @@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) { } // 
@@ -153,7 +169,6 @@ void TestIntraPred32(VpxPredFunc const *pred_funcs) {
 }

 // -----------------------------------------------------------------------------
-// 4x4

 INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_dc_left_predictor_4x4_c, vpx_dc_top_predictor_4x4_c,
@@ -163,47 +178,6 @@ INTRA_PRED_TEST(C, TestIntraPred4, vpx_dc_predictor_4x4_c,
                 vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
                 vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)

-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
-                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
-                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
-                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
-                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
-                vpx_tm_predictor_4x4_sse2)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
-                vpx_d63_predictor_4x4_ssse3, NULL)
-#endif  // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
-                vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
-                vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
-                vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
-                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_neon)
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
-                vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
-                vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
-                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_4x4_msa)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 8x8
-
 INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_dc_left_predictor_8x8_c, vpx_dc_top_predictor_8x8_c,
                 vpx_dc_128_predictor_8x8_c, vpx_v_predictor_8x8_c,
@@ -212,46 +186,6 @@ INTRA_PRED_TEST(C, TestIntraPred8, vpx_dc_predictor_8x8_c,
                 vpx_d153_predictor_8x8_c, vpx_d207_predictor_8x8_c,
                 vpx_d63_predictor_8x8_c, vpx_tm_predictor_8x8_c)

-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
-                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
-                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
-                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
-#endif  // HAVE_SSE2
-
-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_d153_predictor_8x8_ssse3,
-                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
-#endif  // HAVE_SSSE3
-
-#if HAVE_DSPR2
-INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
-                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
-                NULL, NULL, vpx_tm_predictor_8x8_c)
-#endif  // HAVE_DSPR2
-
-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
-                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
-                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
-                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
-                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
-
-#endif  // HAVE_NEON
-
-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
-                vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
-                vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
-                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_8x8_msa)
-#endif  // HAVE_MSA
-
-// -----------------------------------------------------------------------------
-// 16x16
-
 INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
                 vpx_dc_left_predictor_16x16_c, vpx_dc_top_predictor_16x16_c,
                 vpx_dc_128_predictor_16x16_c, vpx_v_predictor_16x16_c,
@@ -260,87 +194,287 @@ INTRA_PRED_TEST(C, TestIntraPred16, vpx_dc_predictor_16x16_c,
                 vpx_d153_predictor_16x16_c, vpx_d207_predictor_16x16_c,
                 vpx_d63_predictor_16x16_c, vpx_tm_predictor_16x16_c)

+INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
+                vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
+                vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
+                vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
+                vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
+                vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
+                vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+
 #if HAVE_SSE2
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+                vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+                vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
+                vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL,
+                NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL,
+                vpx_tm_predictor_4x4_sse2)
+
+INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2,
+                vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2,
+                vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2,
+                vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2)
+
 INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2,
                 vpx_dc_left_predictor_16x16_sse2,
                 vpx_dc_top_predictor_16x16_sse2,
                 vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2,
                 vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_sse2)
+
+INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
+                vpx_dc_left_predictor_32x32_sse2,
+                vpx_dc_top_predictor_32x32_sse2,
+                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
+                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_sse2)
 #endif  // HAVE_SSE2

 #if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL,
+                vpx_d63_predictor_4x4_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_d153_predictor_8x8_ssse3,
+                vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL)
 INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_d45_predictor_16x16_ssse3, NULL, NULL,
                 vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3,
                 vpx_d63_predictor_16x16_ssse3, NULL)
+INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_d45_predictor_32x32_ssse3, NULL, NULL,
+                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
+                vpx_d63_predictor_32x32_ssse3, NULL)
 #endif  // HAVE_SSSE3

 #if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_4x4_dspr2)
+INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL,
+                NULL, NULL,
+                NULL, NULL, vpx_tm_predictor_8x8_c)
 INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL,
                 NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL,
                 NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_DSPR2

 #if HAVE_NEON
+INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon,
+                vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon,
+                vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon,
+                vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon,
+                vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon,
+                vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon,
+                vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon,
+                vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, NULL,
+                NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_neon)
 INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon,
                 vpx_dc_left_predictor_16x16_neon,
                 vpx_dc_top_predictor_16x16_neon,
                 vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon,
                 vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, NULL,
                 NULL, NULL, NULL, NULL, vpx_tm_predictor_16x16_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
+                vpx_dc_left_predictor_32x32_neon,
+                vpx_dc_top_predictor_32x32_neon,
+                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
+                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_neon)
 #endif  // HAVE_NEON

 #if HAVE_MSA
+INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa,
+                vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa,
+                vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa,
+                vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_4x4_msa)
+INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa,
+                vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa,
+                vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa,
+                vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_8x8_msa)
 INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa,
                 vpx_dc_left_predictor_16x16_msa,
                 vpx_dc_top_predictor_16x16_msa,
                 vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa,
                 vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_16x16_msa)
+INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
+                vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
+                vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
+                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
+                vpx_tm_predictor_32x32_msa)
 #endif  // HAVE_MSA

 // -----------------------------------------------------------------------------
-// 32x32

-INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c,
-                vpx_dc_left_predictor_32x32_c, vpx_dc_top_predictor_32x32_c,
-                vpx_dc_128_predictor_32x32_c, vpx_v_predictor_32x32_c,
-                vpx_h_predictor_32x32_c, vpx_d45_predictor_32x32_c,
-                vpx_d135_predictor_32x32_c, vpx_d117_predictor_32x32_c,
-                vpx_d153_predictor_32x32_c, vpx_d207_predictor_32x32_c,
-                vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c)
+#if CONFIG_VP9_HIGHBITDEPTH
+namespace {

-#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2,
-                vpx_dc_left_predictor_32x32_sse2,
-                vpx_dc_top_predictor_32x32_sse2,
-                vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2,
-                vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_sse2)
-#endif  // HAVE_SSE2
+typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+                                  const uint16_t *above, const uint16_t *left,
+                                  int bd);

-#if HAVE_SSSE3
-INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_d45_predictor_32x32_ssse3, NULL, NULL,
-                vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3,
-                vpx_d63_predictor_32x32_ssse3, NULL)
-#endif  // HAVE_SSSE3
+typedef IntraPredTestMem<uint16_t> Vp9HighbdIntraPredTestMem;

-#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon,
-                vpx_dc_left_predictor_32x32_neon,
-                vpx_dc_top_predictor_32x32_neon,
-                vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon,
-                vpx_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_neon)
-#endif  // HAVE_NEON
+void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[], int block_size) {
+  const int kNumTests = static_cast<int>(
+      2.e10 / (block_size * block_size * kNumVp9IntraPredFuncs));
+  Vp9HighbdIntraPredTestMem intra_pred_test_mem;
+  const uint16_t *const above = intra_pred_test_mem.above_mem + 16;

-#if HAVE_MSA
-INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
-                vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa,
-                vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa,
-                vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
-                vpx_tm_predictor_32x32_msa)
-#endif  // HAVE_MSA
+  intra_pred_test_mem.Init(block_size, 12);
+
+  for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    vpx_usec_timer timer;
+    vpx_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, kBPS, above,
+                    intra_pred_test_mem.left, 12);
+    }
+    libvpx_test::ClearSystemState();
+    vpx_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(name, signatures, intra_pred_test_mem.src,
+                      sizeof(intra_pred_test_mem.src), elapsed_time, k);
+  }
+}
+
+void TestHighbdIntraPred4(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "11f74af6c5737df472f3275cbde062fa", "51bea056b6447c93f6eb8f6b7e8f6f71",
+    "27e97f946766331795886f4de04c5594", "53ab15974b049111fb596c5168ec7e3f",
+    "f0b640bb176fbe4584cf3d32a9b0320a", "729783ca909e03afd4b47111c80d967b",
+    "fbf1c30793d9f32812e4d9f905d53530", "293fc903254a33754133314c6cdba81f",
+    "f8074d704233e73dfd35b458c6092374", "aa6363d08544a1ec4da33d7a0be5640d",
+    "462abcfdfa3d087bb33c9a88f2aec491", "863eab65d22550dd44a2397277c1ec71",
+    "23d61df1574d0fa308f9731811047c4b"
+  };
+  TestHighbdIntraPred("Intra4", pred_funcs, kSignatures, 4);
+}
+
+void TestHighbdIntraPred8(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "03da8829fe94663047fd108c5fcaa71d", "ecdb37b8120a2d3a4c706b016bd1bfd7",
+    "1d4543ed8d2b9368cb96898095fe8a75", "f791c9a67b913cbd82d9da8ecede30e2",
+    "065c70646f4dbaff913282f55a45a441", "51f87123616662ef7c35691497dfd0ba",
+    "2a5b0131ef4716f098ee65e6df01e3dd", "9ffe186a6bc7db95275f1bbddd6f7aba",
+    "a3258a2eae2e2bd55cb8f71351b22998", "8d909f0a2066e39b3216092c6289ece4",
+    "d183abb30b9f24c886a0517e991b22c7", "702a42fe4c7d665dc561b2aeeb60f311",
+    "7b5dbbbe7ae3a4ac2948731600bde5d6"
+  };
+  TestHighbdIntraPred("Intra8", pred_funcs, kSignatures, 8);
+}
+
+void TestHighbdIntraPred16(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "e33cb3f56a878e2fddb1b2fc51cdd275", "c7bff6f04b6052c8ab335d726dbbd52d",
+    "d0b0b47b654a9bcc5c6008110a44589b", "78f5da7b10b2b9ab39f114a33b6254e9",
+    "c78e31d23831abb40d6271a318fdd6f3", "90d1347f4ec9198a0320daecb6ff90b8",
+    "d2c623746cbb64a0c9e29c10f2c57041", "cf28bd387b81ad3e5f1a1c779a4b70a0",
+    "24c304330431ddeaf630f6ce94af2eac", "91a329798036bf64e8e00a87b131b8b1",
+    "d39111f22885307f920796a42084c872", "e2e702f7250ece98dd8f3f2854c31eeb",
+    "e2fb05b01eb8b88549e85641d8ce5b59"
+  };
+  TestHighbdIntraPred("Intra16", pred_funcs, kSignatures, 16);
+}
+
+void TestHighbdIntraPred32(VpxHighbdPredFunc const *pred_funcs) {
+  static const char *const kSignatures[kNumVp9IntraPredFuncs] = {
+    "a3e8056ba7e36628cce4917cd956fedd", "cc7d3024fe8748b512407edee045377e",
+    "2aab0a0f330a1d3e19b8ecb8f06387a3", "a547bc3fb7b06910bf3973122a426661",
+    "26f712514da95042f93d6e8dc8e431dc", "bb08c6e16177081daa3d936538dbc2e3",
+    "8f031af3e2650e89620d8d2c3a843d8b", "42867c8553285e94ee8e4df7abafbda8",
+    "6496bdee96100667833f546e1be3d640", "2ebfa25bf981377e682e580208504300",
+    "3e8ae52fd1f607f348aa4cb436c71ab7", "3d4efe797ca82193613696753ea624c4",
+    "cb8aab6d372278f3131e8d99efde02d9"
+  };
+  TestHighbdIntraPred("Intra32", pred_funcs, kSignatures, 32);
+}
+
+}  // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to |test_func|. The test name is 'arch.test_func', e.g., C.TestIntraPred4.
+#define HIGHBD_INTRA_PRED_TEST(arch, test_func, dc, dc_left, dc_top, dc_128,  \
+                               v, h, d45, d135, d117, d153, d207, d63, tm)    \
+  TEST(arch, test_func) {                                                     \
+    static const VpxHighbdPredFunc vpx_intra_pred[] = {                       \
+      dc, dc_left, dc_top, dc_128, v, h, d45, d135, d117, d153, d207, d63, tm \
+    };                                                                        \
+    test_func(vpx_intra_pred);                                                \
+  }
+
+// -----------------------------------------------------------------------------
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_c,
+    vpx_highbd_dc_left_predictor_4x4_c, vpx_highbd_dc_top_predictor_4x4_c,
+    vpx_highbd_dc_128_predictor_4x4_c, vpx_highbd_v_predictor_4x4_c,
+    vpx_highbd_h_predictor_4x4_c, vpx_highbd_d45_predictor_4x4_c,
+    vpx_highbd_d135_predictor_4x4_c, vpx_highbd_d117_predictor_4x4_c,
+    vpx_highbd_d153_predictor_4x4_c, vpx_highbd_d207_predictor_4x4_c,
+    vpx_highbd_d63_predictor_4x4_c, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_c,
+    vpx_highbd_dc_left_predictor_8x8_c, vpx_highbd_dc_top_predictor_8x8_c,
+    vpx_highbd_dc_128_predictor_8x8_c, vpx_highbd_v_predictor_8x8_c,
+    vpx_highbd_h_predictor_8x8_c, vpx_highbd_d45_predictor_8x8_c,
+    vpx_highbd_d135_predictor_8x8_c, vpx_highbd_d117_predictor_8x8_c,
+    vpx_highbd_d153_predictor_8x8_c, vpx_highbd_d207_predictor_8x8_c,
+    vpx_highbd_d63_predictor_8x8_c, vpx_highbd_tm_predictor_8x8_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_c,
+    vpx_highbd_dc_left_predictor_16x16_c, vpx_highbd_dc_top_predictor_16x16_c,
+    vpx_highbd_dc_128_predictor_16x16_c, vpx_highbd_v_predictor_16x16_c,
+    vpx_highbd_h_predictor_16x16_c, vpx_highbd_d45_predictor_16x16_c,
+    vpx_highbd_d135_predictor_16x16_c, vpx_highbd_d117_predictor_16x16_c,
+    vpx_highbd_d153_predictor_16x16_c, vpx_highbd_d207_predictor_16x16_c,
+    vpx_highbd_d63_predictor_16x16_c, vpx_highbd_tm_predictor_16x16_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C, TestHighbdIntraPred32,
+    vpx_highbd_dc_predictor_32x32_c,
+    vpx_highbd_dc_left_predictor_32x32_c, vpx_highbd_dc_top_predictor_32x32_c,
+    vpx_highbd_dc_128_predictor_32x32_c, vpx_highbd_v_predictor_32x32_c,
+    vpx_highbd_h_predictor_32x32_c, vpx_highbd_d45_predictor_32x32_c,
+    vpx_highbd_d135_predictor_32x32_c, vpx_highbd_d117_predictor_32x32_c,
+    vpx_highbd_d153_predictor_32x32_c, vpx_highbd_d207_predictor_32x32_c,
+    vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4,
+                       vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8,
+                       vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL,
+                       NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16,
+                       vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL,
+                       vpx_highbd_tm_predictor_16x16_sse2)
+
+HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32,
+                       vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL,
+                       vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL,
+                       NULL, NULL, NULL, NULL,
+                       vpx_highbd_tm_predictor_32x32_sse2)
+#endif  // HAVE_SSE2
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH

 #include "test/test_libvpx.cc"
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 804dc8956..f89f852b6 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -65,6 +65,7 @@ class VPxEncoderThreadTest
       encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
       encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
       encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
     } else {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
       encoder->Control(VP9E_SET_AQ_MODE, 3);
@@ -127,7 +128,7 @@ VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
                           ::testing::Values(::libvpx_test::kTwoPassGood,
                                             ::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(1, 9),   // cpu_used
+                          ::testing::Range(0, 9),   // cpu_used
                           ::testing::Range(0, 3),   // tile_columns
                           ::testing::Range(2, 5));  // threads
 }  // namespace
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index 1c4a3392e..7cd32b990 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -28,25 +28,25 @@ using libvpx_test::ACMRandom;

 const int count_test_block = 100000;

-typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
-                          const uint16_t *above, const uint16_t *left, int bps);
+typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);

-struct IntraPredFunc {
-  IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
-                int block_size_value = 0, int bit_depth_value = 0)
+struct IntraPredParam {
+  IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL,
+                 int block_size_value = 0, int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
         bit_depth(bit_depth_value) {}

-  IntraPred pred_fn;
-  IntraPred ref_fn;
+  IntraPredFunc pred_fn;
+  IntraPredFunc ref_fn;
   int block_size;
   int bit_depth;
 };

-class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
+template <typename Pixel, typename PredParam>
+class IntraPredTest : public ::testing::TestWithParam<PredParam> {
  public:
-  void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst,
-               uint16_t *ref_dst) {
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
uint16_t *ref_dst) { + void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int block_size = params_.block_size; above_row_ = above_data + 16; @@ -56,13 +56,16 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> { int error_count = 0; for (int i = 0; i < count_test_block; ++i) { // Fill edges with random data, try first with saturated values. - for (int x = -1; x <= block_size * 2; x++) { + for (int x = -1; x < block_size; x++) { if (i == 0) { above_row_[x] = mask_; } else { above_row_[x] = rnd.Rand16() & mask_; } } + for (int x = block_size; x < 2 * block_size; x++) { + above_row_[x] = above_row_[block_size - 1]; + } for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -78,17 +81,12 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> { protected: virtual void SetUp() { - params_ = GetParam(); + params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; } - void Predict() { - const int bit_depth = params_.bit_depth; - params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); - ASM_REGISTER_STATE_CHECK( - params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); - } + void Predict(); void CheckPrediction(int test_case_number, int *error_count) const { // For each pixel ensure that the calculated value is the same as reference. @@ -104,18 +102,223 @@ class VP9IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> { } } - uint16_t *above_row_; - uint16_t *left_col_; - uint16_t *dst_; - uint16_t *ref_dst_; + Pixel *above_row_; + Pixel *left_col_; + Pixel *dst_; + Pixel *ref_dst_; ptrdiff_t stride_; int mask_; - IntraPredFunc params_; + PredParam params_; }; +template <> +void IntraPredTest<uint8_t, IntraPredParam>::Predict() { + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_)); +} + +typedef IntraPredTest<uint8_t, IntraPredParam> VP9IntraPredTest; + TEST_P(VP9IntraPredTest, IntraPredTests) { // max block size is 32 + DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]); + DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]); + DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]); + RunTest(left_col, above_data, dst, ref_dst); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_sse2, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d207_predictor_4x4_sse2, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_sse2, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_sse2, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_sse2, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_sse2, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_sse2, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_sse2, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_sse2, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_sse2, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_sse2, &vpx_dc_predictor_4x4_c, 4, + 8), 
+ IntraPredParam(&vpx_dc_predictor_8x8_sse2, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_sse2, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_sse2, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_sse2, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_sse2, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_sse2, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_sse2, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_sse2, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_sse2, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_sse2, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_sse2, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_sse2, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_sse2, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_sse2, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_sse2, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_sse2, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_sse2, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_sse2, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_sse2, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_ssse3, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_ssse3, + &vpx_d63_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d63_predictor_8x8_ssse3, + &vpx_d63_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d63_predictor_16x16_ssse3, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_ssse3, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_ssse3, + &vpx_d153_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_ssse3, + &vpx_d153_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_ssse3, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_ssse3, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_8x8_ssse3, + &vpx_d207_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_ssse3, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_ssse3, + &vpx_d207_predictor_32x32_c, 32, 8))); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, VP9IntraPredTest, + ::testing::Values( + IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d45_predictor_8x8_neon, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d45_predictor_16x16_neon, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_dc_128_predictor_4x4_neon, + &vpx_dc_128_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_128_predictor_8x8_neon, + &vpx_dc_128_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_neon, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_neon, + 
&vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_4x4_neon, + &vpx_dc_left_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_left_predictor_8x8_neon, + &vpx_dc_left_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_neon, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_neon, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_4x4_neon, &vpx_dc_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_neon, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_16x16_neon, &vpx_dc_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_neon, &vpx_dc_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_dc_top_predictor_4x4_neon, + &vpx_dc_top_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_dc_top_predictor_8x8_neon, + &vpx_dc_top_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_neon, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_neon, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_4x4_neon, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_neon, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_h_predictor_16x16_neon, &vpx_h_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_neon, &vpx_h_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_tm_predictor_4x4_neon, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_neon, &vpx_tm_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_tm_predictor_16x16_neon, &vpx_tm_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_neon, &vpx_tm_predictor_32x32_c, + 32, 8), + IntraPredParam(&vpx_v_predictor_4x4_neon, &vpx_v_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_v_predictor_8x8_neon, &vpx_v_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_v_predictor_16x16_neon, &vpx_v_predictor_16x16_c, + 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_neon, &vpx_v_predictor_32x32_c, + 32, 8))); +#endif // HAVE_NEON + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bps); +struct HighbdIntraPredParam { + HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL, + int block_size_value = 0, int bit_depth_value = 0) + : pred_fn(pred), ref_fn(ref), block_size(block_size_value), + bit_depth(bit_depth_value) {} + + HighbdIntraPred pred_fn; + HighbdIntraPred ref_fn; + int block_size; + int bit_depth; +}; + +template <> +void IntraPredTest<uint16_t, HighbdIntraPredParam>::Predict() { + const int bit_depth = params_.bit_depth; + params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth); + ASM_REGISTER_STATE_CHECK( + params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); +} + +typedef IntraPredTest<uint16_t, HighbdIntraPredParam> VP9HighbdIntraPredTest; + +TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { + // max block size is 32 DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]); DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]); DECLARE_ALIGNED(16, uint16_t, dst[3 * 32 * 32]); @@ -124,88 +327,90 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { } #if HAVE_SSE2 -#if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_8, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - 
&vpx_highbd_tm_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 8), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 8), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 8), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 8))); + SSE2_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 8))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_10, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 10), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 10), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 10), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 10))); + SSE2_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + 
HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 10))); INSTANTIATE_TEST_CASE_P( - SSE2_TO_C_12, VP9IntraPredTest, - ::testing::Values(IntraPredFunc(&vpx_highbd_dc_predictor_32x32_sse2, - &vpx_highbd_dc_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_16x16_sse2, - &vpx_highbd_tm_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_32x32_sse2, - &vpx_highbd_tm_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_4x4_sse2, - &vpx_highbd_dc_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_8x8_sse2, - &vpx_highbd_dc_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_dc_predictor_16x16_sse2, - &vpx_highbd_dc_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_4x4_sse2, - &vpx_highbd_v_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_v_predictor_8x8_sse2, - &vpx_highbd_v_predictor_8x8_c, 8, 12), - IntraPredFunc(&vpx_highbd_v_predictor_16x16_sse2, - &vpx_highbd_v_predictor_16x16_c, 16, 12), - IntraPredFunc(&vpx_highbd_v_predictor_32x32_sse2, - &vpx_highbd_v_predictor_32x32_c, 32, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_4x4_sse2, - &vpx_highbd_tm_predictor_4x4_c, 4, 12), - IntraPredFunc(&vpx_highbd_tm_predictor_8x8_sse2, - &vpx_highbd_tm_predictor_8x8_c, 8, 12))); + SSE2_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, + &vpx_highbd_dc_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, + &vpx_highbd_dc_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_16x16_sse2, + &vpx_highbd_dc_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, + &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_sse2, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, + &vpx_highbd_v_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, + &vpx_highbd_v_predictor_8x8_c, 8, 12), + 
HighbdIntraPredParam(&vpx_highbd_v_predictor_16x16_sse2, + &vpx_highbd_v_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, + &vpx_highbd_v_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/tools.mk b/tools.mk new file mode 100644 index 000000000..3c660b1df --- /dev/null +++ b/tools.mk @@ -0,0 +1,110 @@ +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# List of tools to build. +TOOLS-yes += tiny_ssim.c +tiny_ssim.SRCS += vpx/vpx_integer.h +tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 +tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files + +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Expand list of selected tools to build (as specified above) +TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) +ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) + + +# Expand all tools sources into a variable containing all sources +# for each tool (not just the main one specified in TOOLS) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk)) + + +# Create build/install dependencies for all tools. The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) + + +# Instantiate linker template for all tools. +$(foreach bin,$(BINS-yes),\ + $(eval $(bin):)\ + $(eval $(call linker_template,$(bin),\ + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ + -lm\ + ))) + + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in TOOLS. This has the unfortunate side effect that +# touching the source files triggers a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this.
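(Editorial aside, not part of this change: a further utility would be registered in tools.mk with the same four-variable pattern used for tiny_ssim above; the tool name tiny_psnr, its GUID, and its description below are hypothetical placeholders shown only to illustrate the convention.

# Hypothetical illustration only, not in this patch.
TOOLS-yes += tiny_psnr.c
tiny_psnr.SRCS += vpx/vpx_integer.h
tiny_psnr.GUID = 00000000-0000-0000-0000-000000000000
tiny_psnr.DESCRIPTION = Placeholder description for an example tool

The generic rules above would then build, link against -lm, install, and document such a tool with no further edits. The MSVS project template referenced in the preceding comment follows.)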
+define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ +endef +TOOLS_BASENAME := $(notdir $(TOOLS)) +PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(<F)" >> $@ + @echo "*/" >> $@ + +tools.dox: tools.mk + @echo " [DOXY] $@" + @echo "/*!\page tools Tools" > $@ + @echo " This SDK includes a number of tools/utilities."\ + "The following tools are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\ + echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox) +DOCS-yes += tools.doxy tools.dox +tools.doxy: tools.dox $(TOOLS:.c=.dox) + @echo "INPUT += $^" > $@ diff --git a/tools/tiny_ssim.c b/tools/tiny_ssim.c new file mode 100644 index 000000000..28052e0a8 --- /dev/null +++ b/tools/tiny_ssim.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <errno.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "vpx/vpx_integer.h" + +void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2) +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2) + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count) { + int64_t ssim_n, ssim_d; + int64_t c1, c2; + + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); +} + +// We are using an 8x8 moving window with the starting location of each 8x8 +// window on the 4x4 pixel grid. Such an arrangement allows the windows to +// overlap block boundaries to penalize blocking artifacts. +double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample points start at each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, + int recon_stride, unsigned int cols, + unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + + return total_sse; +} + +#define MAX_PSNR 100 + +double vp9_mse2psnr(double samples, double peak, double mse) { + double psnr; + + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); + else + psnr = MAX_PSNR; // Limit to prevent division by zero + + if (psnr > MAX_PSNR) psnr = MAX_PSNR; + + return psnr; +} + +int main(int argc, char *argv[]) { + FILE *f[2]; + uint8_t *buf[2]; + int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0; + double ssim = 0, psnravg = 0, psnrglb = 0; + double ssimy, ssimu, ssimv; + uint64_t psnry, psnru, psnrv; + + if (argc < 4) { + fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n", + argv[0]); + return 1; + } + f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin; + f[1] = strcmp(argv[2], "-") ?
fopen(argv[2], "rb") : stdin; + sscanf(argv[3], "%dx%d", &w, &h); + // Number of frames to skip from file1.yuv for every frame used. Normal values + // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding + // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + if (argc > 4) { + sscanf(argv[4], "%d", &tl_skip); + } + if (!f[0] || !f[1]) { + fprintf(stderr, "Could not open input files: %s\n", strerror(errno)); + return 1; + } + if (w <= 0 || h <= 0 || w & 1 || h & 1) { + fprintf(stderr, "Invalid size %dx%d\n", w, h); + return 1; + } + buf[0] = malloc(w * h * 3 / 2); + buf[1] = malloc(w * h * 3 / 2); + n_frames = 0; + while (1) { + size_t r1, r2; + r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]); + if (r1) { + // Reading parts of file1.yuv that were not used in temporal layer. + if (tl_skips_remaining > 0) { + --tl_skips_remaining; + continue; + } + // Use frame, but skip |tl_skip| after it. + tl_skips_remaining = tl_skip; + } + r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]); + if (r1 && r2 && r1 != r2) { + fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), + (int)r1, (int)r2); + return 1; + } else if (r1 == 0 || r2 == 0) { + break; + } +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); + psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h); + psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2); + psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4, + w / 2, h / 2); + ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv); + psnravg += + vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv); + psnrglb += psnry + psnru + psnrv; + n_frames++; + } + free(buf[0]); + free(buf[1]); + ssim /= n_frames; + psnravg /= n_frames; + psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb); + + printf("AvgPSNR: %lf\n", psnravg); + printf("GlbPSNR: %lf\n", psnrglb); + printf("SSIM: %lf\n", 100 * pow(ssim, 8.0)); + printf("Nframes: %d\n", n_frames); + + if (strcmp(argv[1], "-")) fclose(f[0]); + if (strcmp(argv[2], "-")) fclose(f[1]); + + return 0; +} diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index fde0b7e31..628d1c8d2 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1517,7 +1517,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data, return 0; } - tile_data->xd.error_info = &tile_data->error_info; tile_data->xd.corrupted = 0; do { @@ -1529,6 +1528,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data, &tile_data->error_info, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + // init resets xd.error_info + tile_data->xd.error_info = &tile_data->error_info; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 3bd6026d4..2a5800382 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2441,6 +2441,8 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, cpi->resize_pending = 1; return 1; } + // Force recode if projected_frame_size > max_frame_bandwidth + if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1; // TODO(agrange) high_limit could be greater than the scale-down threshold. 
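  // Reviewer note, not in the original source: beyond the max_frame_bandwidth
  // check added above, this test asks for a recode when the projected frame
  // size moves outside the low_limit/high_limit band supplied by the caller
  // while q still has room to move in the corrective direction; only the
  // high_limit clause is visible in this hunk.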
if ((rc->projected_frame_size > high_limit && q < maxq) || @@ -2799,13 +2801,13 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d" + fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " "%10"PRId64" %10"PRId64" %10d " "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" "%6d %6d %5d %5d %5d " "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d\n", + "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", cpi->common.current_video_frame, cm->width, cm->height, cpi->rc.source_alt_ref_pending, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 2f1fe360d..788952d34 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -48,10 +48,8 @@ #define FIRST_PASS_Q 10.0 #define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 -#define MIN_KF_BOOST 300 #define NEW_MV_MODE_PENALTY 32 #define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 @@ -1578,7 +1576,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction @@ -1681,6 +1679,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, #define BASELINE_ERR_PER_MB 1000.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, double this_frame_mv_in_out, double max_boost) { double frame_boost; const double lq = vp9_convert_qindex_to_q( @@ -1694,17 +1693,56 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, // Underlying boost factor is based on inter error ratio. frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scaling frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; - // Increase boost for frames where new data coming into frame (e.g. zoom out). - // Slightly reduce boost if there is a net balance of motion out of the frame - // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. + return VPXMIN(frame_boost, max_boost * boost_q_correction); +} + +#define KF_BOOST_FACTOR 12.5 +static double calc_kf_frame_boost(VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, + double this_frame_mv_in_out, + double max_boost) { + double frame_boost; + const double lq = vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); + int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ?
cpi->initial_mbs + : cpi->common.MBs; + + // Correct for any inactive region in the image + num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + + // Underlying boost factor is based on inter error ratio. + frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out + if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In the extreme case the boost is halved. - else - frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + // Q correction and scaling + frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction; return VPXMIN(frame_boost, max_boost * boost_q_correction); } @@ -1719,6 +1757,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; + double sr_accumulator = 0.0; int arf_boost; int flash_detected = 0; @@ -1745,9 +1784,10 @@ : decay_accumulator; } - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); } *f_boost = (int)boost_score; @@ -1759,6 +1799,7 @@ this_frame_mv_in_out = 0.0; mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; + sr_accumulator = 0.0; // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { @@ -1783,9 +1824,10 @@ : decay_accumulator; } - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); } *b_boost = (int)boost_score; @@ -2085,7 +2127,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_ratio_accumulator = 0.0; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2095,6 +2136,7 @@ double mv_ratio_accumulator_thresh; double mv_in_out_thresh; double abs_mv_in_out_thresh; + double sr_accumulator = 0.0; unsigned int allow_alt_ref = is_altref_enabled(cpi); int f_boost = 0; @@ -2221,9 +2263,10 @@ } // Calculate a boost number for this frame. - boost_score += - decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, &next_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); // Break out conditions.
if ( @@ -2473,6 +2516,10 @@ static int test_candidate_kf(TWO_PASS *twopass, } #define FRAMES_TO_CHECK_DECAY 8 +#define KF_MAX_FRAME_BOOST 96.0 +#define MIN_KF_TOT_BOOST 300 +#define MAX_KF_TOT_BOOST 5400 +#define KF_BOOST_SCAN_MAX_FRAMES 32 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; @@ -2485,14 +2532,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; - int loop_decay_counter = 0; double decay_accumulator = 1.0; - double av_decay_accumulator = 0.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_mod_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + double sr_accumulator = 0.0; vp9_zero(next_frame); @@ -2642,34 +2688,36 @@ // Scan through the kf group collating various stats used to determine // how many bits to spend on it. - decay_accumulator = 1.0; boost_score = 0.0; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // Monitor for static sections. - zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(cpi, &next_frame)); - - // Not all frames in the group are necessarily used in calculating boost. - if ((i <= rc->max_gf_interval) || - ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { - const double frame_boost = - calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST); - - // How fast is prediction quality decaying. - if (!detect_flash(twopass, 0)) { - const double loop_decay_rate = - get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator *= loop_decay_rate; - decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR); - av_decay_accumulator += decay_accumulator; - ++loop_decay_counter; - } - boost_score += (decay_accumulator * frame_boost); + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + double frame_boost; + double zm_factor; + + // Monitor for static sections. + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + + // Factor 0.75-1.25 based on how much of the frame is static. + zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + // The second (lagging) ref error is not valid immediately after + // a key frame because either the lag has not built up (in the case of + // the first key frame) or it points to a reference before the new key + // frame. + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, + KF_MAX_FRAME_BOOST * zm_factor); + + boost_score += frame_boost; + if (frame_boost < 25.00) break; + } else { + break; + } } - av_decay_accumulator /= (double)loop_decay_counter; reset_fpf_position(twopass, start_position); @@ -2681,9 +2729,9 @@ start_position, twopass->stats_in_end, rc->frames_to_key); // Apply various clamps for min and max boost - rc->kf_boost = (int)(av_decay_accumulator * boost_score); - rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST); + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself.
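  // Reviewer note, not in the original source: kf_boost as computed above is
  // floored at 3 * frames_to_key and held within [MIN_KF_TOT_BOOST,
  // MAX_KF_TOT_BOOST], i.e. 300 to 5400, before the bit allocation below.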
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c index 5aa0b8ddb..88b1531d8 100644 --- a/vpx/src/svc_encodeframe.c +++ b/vpx/src/svc_encodeframe.c @@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11, static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16, 16, 16 }; +static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 }; + +static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 }; + typedef enum { QUANTIZER = 0, BITRATE, @@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, char *token; const char *delim = ","; char *save_ptr; + int num_layers = svc_ctx->spatial_layers; + if (type == BITRATE) + num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers; if (input == NULL || option0 == NULL || (option1 == NULL && type == SCALE_FACTOR)) @@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, input_string = strdup(input); token = strtok_r(input_string, delim, &save_ptr); - for (i = 0; i < svc_ctx->spatial_layers; ++i) { + for (i = 0; i < num_layers; ++i) { if (token != NULL) { res = extract_option(type, token, option0 + i, option1 + i); if (res != VPX_CODEC_OK) break; @@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, break; } } - if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) { + if (res == VPX_CODEC_OK && i != num_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: layer params type: %d %d values required, " "but only %d specified\n", - type, svc_ctx->spatial_layers, i); + type, num_layers, i); res = VPX_CODEC_INVALID_PARAM; } free(input_string); @@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -void assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, + vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; if (svc_ctx->temporal_layering_mode != 0) { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0; + total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers + + svc_ctx->temporal_layers - 1]; for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] = si->bitrates[sl * svc_ctx->temporal_layers + tl]; + if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <= + si->bitrates[sl * svc_ctx->temporal_layers + tl - 1])) + return VPX_CODEC_INVALID_PARAM; } } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } else { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (i = 0; i < svc_ctx->spatial_layers; ++i) { enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; - enc_cfg->rc_target_bitrate += si->bitrates[i]; + 
enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i]; + total_bitrate += si->bitrates[i]; } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } } + return VPX_CODEC_OK; } vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, @@ -412,12 +429,24 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; si->svc_params.speed_per_layer[sl] = svc_ctx->speed; } - + if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS && + svc_ctx->spatial_layers <= 3) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl; + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; + } + } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = 56; + si->svc_params.min_quantizers[i] = 2; + } } } @@ -442,7 +471,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, (int)VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } - assign_layer_bitrates(svc_ctx, enc_cfg); + res = assign_layer_bitrates(svc_ctx, enc_cfg); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "layer bitrates incorrect: \n" + "1) spatial layer bitrates should sum up to target \n" + "2) temporal layer bitrates should be increasing within \n" + "a spatial layer \n"); + return VPX_CODEC_INVALID_PARAM; + } #if CONFIG_SPATIAL_SVC for (i = 0; i < svc_ctx->spatial_layers; ++i) diff --git a/vpx/svc_context.h b/vpx/svc_context.h index c8bde5832..462785075 100644 --- a/vpx/svc_context.h +++ b/vpx/svc_context.h @@ -54,7 +54,7 @@ typedef struct SvcInternal { // values extracted from option, quantizers vpx_svc_extra_cfg_t svc_params; int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; - int bitrates[VPX_SS_MAX_LAYERS]; + int bitrates[VPX_MAX_LAYERS]; // accumulated statistics double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c index 8aad4c579..6be4b0122 100644 --- a/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -103,7 +103,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, dest_stride8 = dest_stride * 8; if (a1 >= 0) { // diff_positive_32_32 a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); + q0u8 = vdupq_n_u8((uint8_t)a1); for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop d = dest; for (j = 0; j < 4; j++) { @@ -119,7 +119,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, } else { // diff_negative_32_32 a1 = -a1; a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; - q0u8 = vdupq_n_u8(a1); + q0u8 = vdupq_n_u8((uint8_t)a1); for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop d = dest; for (j = 0; j < 4; j++) { diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c new file mode 100644 index 000000000..ebec9df54 --- /dev/null +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void idct32_6_neon(const int16_t *input, int16_t *output) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, + s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, + s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, + s1_31; + int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, + s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, + s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, + s2_31; + int16x8_t s3_24, s3_25, s3_26, s3_27; + + load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5, + &in6, &in7); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + // stage 3 + s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, + cospi_12_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, + 
cospi_20_64); + + s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + + s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s2_20 = vsubq_s16(s1_23, s1_20); + s2_21 = vsubq_s16(s1_22, s1_21); + s2_22 = vaddq_s16(s1_21, s1_22); + s2_23 = vaddq_s16(s1_20, s1_23); + s2_24 = vaddq_s16(s1_24, s1_27); + s2_25 = vaddq_s16(s1_25, s1_26); + s2_26 = vsubq_s16(s1_25, s1_26); + s2_27 = vsubq_s16(s1_24, s1_27); + + // stage 5 + s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); + s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + + s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30, + cospi_24_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30, + cospi_8_64); + + s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31, + cospi_24_64); + s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31, + cospi_8_64); + + s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, + -cospi_8_64); + s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, + cospi_24_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, + -cospi_8_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, + cospi_24_64); + + // stage 6 + s2_0 = vaddq_s16(s1_0, s1_7); + s2_1 = vaddq_s16(s1_0, s1_6); + s2_2 = vaddq_s16(s1_0, s1_5); + s2_3 = vaddq_s16(s1_0, s1_4); + s2_4 = vsubq_s16(s1_0, s1_4); + s2_5 = vsubq_s16(s1_0, s1_5); + s2_6 = vsubq_s16(s1_0, s1_6); + s2_7 = vsubq_s16(s1_0, s1_7); + + s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64); + s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64); + + s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64); + s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64); + + s2_16 = vaddq_s16(s1_16, s2_23); + s2_17 = vaddq_s16(s1_17, s2_22); + s2_18 = vaddq_s16(s1_18, s1_21); + s2_19 = vaddq_s16(s1_19, s1_20); + s2_20 = vsubq_s16(s1_19, s1_20); + s2_21 = vsubq_s16(s1_18, s1_21); + s2_22 = vsubq_s16(s1_17, s2_22); + s2_23 = vsubq_s16(s1_16, s2_23); + + s3_24 = vsubq_s16(s1_31, s2_24); + s3_25 = vsubq_s16(s1_30, s2_25); + s3_26 = vsubq_s16(s1_29, s1_26); + s3_27 = vsubq_s16(s1_28, s1_27); + s2_28 = vaddq_s16(s1_27, s1_28); + s2_29 = vaddq_s16(s1_26, s1_29); + s2_30 = vaddq_s16(s2_25, s1_30); + s2_31 = vaddq_s16(s2_24, s1_31); + + // stage 7 + s1_0 = vaddq_s16(s2_0, s2_15); + s1_1 = vaddq_s16(s2_1, s2_14); + s1_2 = vaddq_s16(s2_2, s2_13); + s1_3 = vaddq_s16(s2_3, s2_12); + s1_4 = vaddq_s16(s2_4, s2_11); + s1_5 = vaddq_s16(s2_5, s2_10); + s1_6 = vaddq_s16(s2_6, s2_9); + s1_7 = vaddq_s16(s2_7, s2_8); + s1_8 = vsubq_s16(s2_7, s2_8); + s1_9 = vsubq_s16(s2_6, s2_9); + s1_10 = vsubq_s16(s2_5, s2_10); + s1_11 = vsubq_s16(s2_4, s2_11); + s1_12 = vsubq_s16(s2_3, s2_12); + s1_13 = vsubq_s16(s2_2, s2_13); + s1_14 = vsubq_s16(s2_1, s2_14); + s1_15 = vsubq_s16(s2_0, s2_15); + + s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); + s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + + s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); + 
s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + + s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64); + s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64); + + s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64); + s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s1_0, s2_31)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_1, s2_30)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_2, s2_29)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_3, s2_28)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_4, s1_27)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_5, s1_26)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_6, s1_25)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_7, s1_24)); + output += 8; + + vst1q_s16(output, vaddq_s16(s1_8, s1_23)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_9, s1_22)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_10, s1_21)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_11, s1_20)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_12, s2_19)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_13, s2_18)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_14, s2_17)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_15, s2_16)); + output += 8; + + vst1q_s16(output, vsubq_s16(s1_15, s2_16)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_14, s2_17)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_13, s2_18)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_12, s2_19)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_11, s1_20)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_10, s1_21)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_9, s1_22)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_8, s1_23)); + output += 8; + + vst1q_s16(output, vsubq_s16(s1_7, s1_24)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_6, s1_25)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_5, s1_26)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_4, s1_27)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_3, s2_28)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_2, s2_29)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_1, s2_30)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_0, s2_31)); +} + +static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; + int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, + s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, + s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, + s1_31; + int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, + s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, + s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, + s2_31; + int16x8_t s3_24, s3_25, s3_26, s3_27; + + load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6, + &in7); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + // Different for _8_ + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + 
s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + // stage 3 + s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + // Different for _8_ + s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28, + -cospi_4_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28, + cospi_28_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, + cospi_12_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, + cospi_20_64); + + s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + + s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12, + -cospi_8_64); + s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12, + cospi_24_64); + + s2_16 = vaddq_s16(s1_16, s1_19); + + s2_17 = vaddq_s16(s1_17, s1_18); + s2_18 = vsubq_s16(s1_17, s1_18); + + s2_19 = vsubq_s16(s1_16, s1_19); + + s2_20 = vsubq_s16(s1_23, s1_20); + s2_21 = vsubq_s16(s1_22, s1_21); + + s2_22 = vaddq_s16(s1_21, s1_22); + s2_23 = vaddq_s16(s1_20, s1_23); + + s2_24 = vaddq_s16(s1_24, s1_27); + s2_25 = vaddq_s16(s1_25, s1_26); + s2_26 = vsubq_s16(s1_25, s1_26); + s2_27 = vsubq_s16(s1_24, s1_27); + + s2_28 = vsubq_s16(s1_31, s1_28); + s2_29 = vsubq_s16(s1_30, s1_29); + s2_30 = vaddq_s16(s1_29, s1_30); + s2_31 = vaddq_s16(s1_28, s1_31); + + // stage 5 + s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); + s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + + s1_8 = vaddq_s16(s2_8, s2_11); + s1_9 = vaddq_s16(s2_9, s2_10); + s1_10 = vsubq_s16(s2_9, s2_10); + s1_11 = vsubq_s16(s2_8, s2_11); + s1_12 = vsubq_s16(s2_15, s2_12); + s1_13 = vsubq_s16(s2_14, s2_13); + s1_14 = vaddq_s16(s2_13, s2_14); + s1_15 = vaddq_s16(s2_12, s2_15); + + s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29, + cospi_24_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29, + cospi_8_64); + + s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28, + cospi_24_64); + s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28, + cospi_8_64); + + s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, + -cospi_8_64); + s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, + cospi_24_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, + -cospi_8_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, + cospi_24_64); + + // stage 6 + s2_0 = vaddq_s16(s1_0, s1_7); + s2_1 = vaddq_s16(s1_0, s1_6); + s2_2 = vaddq_s16(s1_0, s1_5); + s2_3 
= vaddq_s16(s1_0, s1_4); + s2_4 = vsubq_s16(s1_0, s1_4); + s2_5 = vsubq_s16(s1_0, s1_5); + s2_6 = vsubq_s16(s1_0, s1_6); + s2_7 = vsubq_s16(s1_0, s1_7); + + s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64); + s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64); + + s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64); + s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64); + + s1_16 = vaddq_s16(s2_16, s2_23); + s1_17 = vaddq_s16(s2_17, s2_22); + s2_18 = vaddq_s16(s1_18, s1_21); + s2_19 = vaddq_s16(s1_19, s1_20); + s2_20 = vsubq_s16(s1_19, s1_20); + s2_21 = vsubq_s16(s1_18, s1_21); + s1_22 = vsubq_s16(s2_17, s2_22); + s1_23 = vsubq_s16(s2_16, s2_23); + + s3_24 = vsubq_s16(s2_31, s2_24); + s3_25 = vsubq_s16(s2_30, s2_25); + s3_26 = vsubq_s16(s1_29, s1_26); + s3_27 = vsubq_s16(s1_28, s1_27); + s2_28 = vaddq_s16(s1_27, s1_28); + s2_29 = vaddq_s16(s1_26, s1_29); + s2_30 = vaddq_s16(s2_25, s2_30); + s2_31 = vaddq_s16(s2_24, s2_31); + + // stage 7 + s1_0 = vaddq_s16(s2_0, s1_15); + s1_1 = vaddq_s16(s2_1, s1_14); + s1_2 = vaddq_s16(s2_2, s2_13); + s1_3 = vaddq_s16(s2_3, s2_12); + s1_4 = vaddq_s16(s2_4, s2_11); + s1_5 = vaddq_s16(s2_5, s2_10); + s1_6 = vaddq_s16(s2_6, s1_9); + s1_7 = vaddq_s16(s2_7, s1_8); + s1_8 = vsubq_s16(s2_7, s1_8); + s1_9 = vsubq_s16(s2_6, s1_9); + s1_10 = vsubq_s16(s2_5, s2_10); + s1_11 = vsubq_s16(s2_4, s2_11); + s1_12 = vsubq_s16(s2_3, s2_12); + s1_13 = vsubq_s16(s2_2, s2_13); + s1_14 = vsubq_s16(s2_1, s1_14); + s1_15 = vsubq_s16(s2_0, s1_15); + + s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); + s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + + s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); + s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + + s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64); + s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64); + + s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64); + s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64); + + // final stage + out0 = vaddq_s16(s1_0, s2_31); + out1 = vaddq_s16(s1_1, s2_30); + out2 = vaddq_s16(s1_2, s2_29); + out3 = vaddq_s16(s1_3, s2_28); + out4 = vaddq_s16(s1_4, s1_27); + out5 = vaddq_s16(s1_5, s1_26); + out6 = vaddq_s16(s1_6, s1_25); + out7 = vaddq_s16(s1_7, s1_24); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, + stride); + + out0 = vaddq_s16(s1_8, s2_23); + out1 = vaddq_s16(s1_9, s2_22); + out2 = vaddq_s16(s1_10, s1_21); + out3 = vaddq_s16(s1_11, s1_20); + out4 = vaddq_s16(s1_12, s2_19); + out5 = vaddq_s16(s1_13, s2_18); + out6 = vaddq_s16(s1_14, s1_17); + out7 = vaddq_s16(s1_15, s1_16); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (8 * stride), stride); + + out0 = vsubq_s16(s1_15, s1_16); + out1 = vsubq_s16(s1_14, s1_17); + out2 = vsubq_s16(s1_13, s2_18); + out3 = vsubq_s16(s1_12, s2_19); + out4 = vsubq_s16(s1_11, s1_20); + out5 = vsubq_s16(s1_10, s1_21); + out6 = vsubq_s16(s1_9, s2_22); + out7 = vsubq_s16(s1_8, s2_23); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (16 * stride), stride); + + out0 = vsubq_s16(s1_7, s1_24); + out1 = vsubq_s16(s1_6, s1_25); + out2 = vsubq_s16(s1_5, s1_26); + out3 = vsubq_s16(s1_4, s1_27); + out4 = vsubq_s16(s1_3, s2_28); + out5 = vsubq_s16(s1_2, s2_29); + out6 = vsubq_s16(s1_1, s2_30); + out7 = vsubq_s16(s1_0, s2_31); + + 
add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (24 * stride), stride); +} + +void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 8]; + int16_t *t = temp; + + idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + idct32_8_neon(t, dest, stride); + t += (8 * 8); + dest += 8; + } +} diff --git a/vpx_dsp/arm/idct4x4_add_neon.asm b/vpx_dsp/arm/idct4x4_add_neon.asm index a4ccba993..bd4e86ded 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/vpx_dsp/arm/idct4x4_add_neon.asm @@ -15,6 +15,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + AREA Block, CODE, READONLY ; name this block of code ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; @@ -33,7 +35,7 @@ ; So, two passes of a transpose followed by a column transform. ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 ; generate scalar constants ; cospi_8_64 = 15137 diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 24b91fe48..8f669c907 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, @@ -28,8 +29,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d26u32 = d27u32 = vdup_n_u32(0); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); diff --git a/vpx_dsp/arm/idct8x8_add_neon.asm b/vpx_dsp/arm/idct8x8_add_neon.asm index 21e75951e..a5c9c927d 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.asm +++ b/vpx_dsp/arm/idct8x8_add_neon.asm @@ -16,6 +16,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are ; loaded in q8-q15. The output will be stored back into q8-q15 registers. ; This macro will touch q0-q7 registers and use them as buffer during @@ -207,10 +209,10 @@ |vpx_idct8x8_64_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 @@ -312,10 +314,10 @@ |vpx_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! 
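Note on the driver just above: vpx_idct32x32_34_add_neon() exploits the eob <= 34 guarantee. With at most 34 nonzero coefficients in scan order, only the upper-left 8x8 corner of the 32x32 coefficient block can be populated, so the first (row) pass runs once over that corner into a 32x8 temp buffer, and the second (column) pass runs four times, producing eight output columns per iteration. A minimal scalar skeleton of the same two-pass layout, with hypothetical row_pass()/col_pass() helpers standing in for idct32_6_neon()/idct32_8_neon():

/* Sketch only: row_pass() and col_pass() are illustrative names, not the
 * library's API. */
void idct32x32_34_sketch(const int16_t *input, uint8_t *dest, int stride) {
  int16_t temp[32 * 8]; /* first-pass output: 32 columns x 8 rows */
  int16_t *t = temp;
  int i;
  row_pass(input, t); /* only rows 0-7 can carry nonzero coefficients */
  for (i = 0; i < 32; i += 8) {
    col_pass(t, dest, stride); /* eight columns per iteration */
    t += 8 * 8;
    dest += 8;
  }
}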
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index d73feebec..159a6ec98 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -173,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -279,14 +280,14 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 000000000..a223c0b63 --- /dev/null +++ b/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,29 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper function used to load tran_low_t into int16, narrowing if + ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! 
+  ENDIF
+  MEND
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
new file mode 100644
index 000000000..5c2a53c03
--- /dev/null
+++ b/vpx_dsp/arm/idct_neon.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_IDCT_NEON_H_
+#define VPX_DSP_ARM_IDCT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+//------------------------------------------------------------------------------
+
+// Helper function used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  const int32x4_t v1 = vld1q_s32(buf + 4);
+  const int16x4_t s0 = vmovn_s32(v0);
+  const int16x4_t s1 = vmovn_s32(v1);
+  return vcombine_s16(s0, s1);
+#else
+  return vld1q_s16(buf);
+#endif
+}
+
+// Multiply a by a_const. Saturate, shift and narrow by 14.
+static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
+                                                      const int16_t a_const) {
+  // Shift by 14 + rounding will be within 16 bits for well-formed streams.
+  // See WRAPLOW and dct_const_round_shift for details.
+  // This instruction doubles the result and returns the high half, essentially
+  // resulting in a right shift by 15. By multiplying the constant first that
+  // becomes a right shift by 14.
+  // The largest possible value used here is
+  // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just*
+  // within the range of int16_t (+32767 / -32768) even when negated.
+  return vqrdmulhq_n_s16(a, a_const * 2);
+}
+
+// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
+    const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
+  // In both add_ and its pair, sub_, the input for well-formed streams will be
+  // well within 16 bits (input to the idct is the difference between two
+  // frames and will be within -255 to 255, or 9 bits).
+  // However, for inputs over about 25,000 (valid for int16_t, but not for idct
+  // input) this function cannot use vaddq_s16.
+  // In order to match existing behavior and intentionally out-of-range tests,
+  // expand the addition up to 32 bits to prevent truncation.
+  int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b));
+  int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
+  temp_low = vmulq_n_s32(temp_low, ab_const);
+  temp_high = vmulq_n_s32(temp_high, ab_const);
+  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+}
+
+// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// 14. +static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); + int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); + temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); + temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, + int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +// Shift the output down by 6 and add it to the destination buffer. +static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, + const int16x8_t a2, const int16x8_t a3, + const int16x8_t a4, const int16x8_t a5, + const int16x8_t a6, const int16x8_t a7, + uint8_t *b, const int b_stride) { + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + b0 = vld1_u8(b); + b += b_stride; + b1 = vld1_u8(b); + b += b_stride; + b2 = vld1_u8(b); + b += b_stride; + b3 = vld1_u8(b); + b += b_stride; + b4 = vld1_u8(b); + b += b_stride; + b5 = vld1_u8(b); + b += b_stride; + b6 = vld1_u8(b); + b += b_stride; + b7 = vld1_u8(b); + b -= (7 * b_stride); + + // c = b + (a >> 6) + c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); + c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); + c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); + c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); + c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); + c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); + c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); + c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + + b0 = vqmovun_s16(c0); + b1 = vqmovun_s16(c1); + b2 = vqmovun_s16(c2); + b3 = vqmovun_s16(c3); + b4 = vqmovun_s16(c4); + b5 = vqmovun_s16(c5); + b6 = vqmovun_s16(c6); + b7 = vqmovun_s16(c7); + + vst1_u8(b, b0); + b += b_stride; + vst1_u8(b, b1); + b += b_stride; + vst1_u8(b, b2); + b += b_stride; + vst1_u8(b, b3); + b += b_stride; + vst1_u8(b, b4); + b += b_stride; + vst1_u8(b, b5); + b += b_stride; + vst1_u8(b, b6); + b += b_stride; + vst1_u8(b, b7); +} +#endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 38e79ed69..e150a5302 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -17,306 +17,254 @@ 
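Two arithmetic details in the new idct_neon.h above are worth a scalar gloss. First, multiply_shift_and_narrow_s16() obtains the idct's (x * c + (1 << 13)) >> 14 rounding (dct_const_round_shift) from vqrdmulhq_n_s16(), which returns the rounded high half of a doubling multiply, by pre-doubling the constant. Second, add_and_store_u8_s16() finishes reconstruction with vrsraq_n_s16(dst, residual, 6), a rounding shift-right-accumulate, then vqmovun_s16() to saturate back to 8 bits. Illustrative scalar models (not library code):

#include <stdint.h>

/* dct_const_round_shift: round-to-nearest right shift by 14. */
static int16_t mul_round_shift14(int16_t a, int16_t c) {
  return (int16_t)(((int32_t)a * c + (1 << 13)) >> 14);
}

/* vqrdmulhq_n_s16(a, k) computes (2 * a * k + (1 << 15)) >> 16 with
 * saturation. With k = 2 * c (2 * c still fits in int16_t for every cospi
 * constant) this equals mul_round_shift14(a, c) for in-range inputs:
 * (4ac + 2^15) >> 16 == (ac + 2^13) >> 14. */
static int16_t doubling_mulh_round(int16_t a, int16_t k) {
  return (int16_t)((2 * (int32_t)a * k + (1 << 15)) >> 16);
}

/* One lane of add_and_store_u8_s16(): vrsraq_n_s16(b, a, 6) is
 * b + ((a + 32) >> 6); vqmovun_s16() clamps the sum to [0, 255]. */
static uint8_t reconstruct_pixel(uint8_t dst, int16_t residual) {
  const int v = dst + ((residual + 32) >> 6);
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}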
//------------------------------------------------------------------------------ // DC 4x4 -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x4_t sum_top; - uint16x4_t sum_left; - uint16x4_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - sum_top = vpadd_u16(p0, p0); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - sum_left = vpadd_u16(p0, p0); - } - - if (do_above && do_left) { - const uint16x4_t sum = vadd_u16(sum_left, sum_top); - dc0 = vrshr_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshr_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshr_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u16(0x80); - } +static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + const uint16x4_t p0 = vpaddl_u8(ref_u8); + return vpadd_u16(p0, p0); +} - { - const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } +static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); + const uint8x8_t a = vld1_u8(above); + const uint8x8_t l = vld1_u8(left); + const uint16x8_t al = vaddl_u8(a, l); + uint16x4_t sum; + uint8x8_t dc; + sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); + dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); + dc_store_4x4(dst, stride, dc); } void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); + dc_store_4x4(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 8x8 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
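For reference, every variant of the refactored DC predictors (4x4 above, and the 8x8/16x16/32x32 versions below) reduces to one rule: average the available border pixels with round-to-nearest, falling back to 128 when neither border is available. A scalar sketch for an n x n block (illustrative, not the library's C reference):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void dc_predictor_sketch(uint8_t *dst, ptrdiff_t stride, int n,
                                const uint8_t *above, const uint8_t *left) {
  int sum = 0, count = 0, dc = 0x80, i;
  if (above != NULL) { for (i = 0; i < n; ++i) sum += above[i]; count += n; }
  if (left != NULL)  { for (i = 0; i < n; ++i) sum += left[i];  count += n; }
  /* count is a power of two, so this rounded divide matches the
   * vrshr_n_u16 shifts used by the NEON versions. */
  if (count) dc = (sum + count / 2) / count;
  for (i = 0; i < n; ++i, dst += stride) memset(dst, dc, n);
}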
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + uint16x4_t sum = vpaddl_u8(ref_u8); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); - } +static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1_u8(dst, dc_dup); } } void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); + const uint16x8_t p0 = vpaddlq_u8(above_and_left); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); + dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); + dc_store_8x8(dst, stride, dc); } void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); + dc_store_8x8(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 16x16 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
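dc_sum_8() above is the stock NEON horizontal-reduction idiom: vpaddl_u8 widens and sums adjacent byte pairs, then each vpadd_u16 halves the number of distinct partial sums, so 2^k lanes reduce in k pairwise steps and the total ends up replicated across every lane (convenient for the vrshr that follows). Extracted as a standalone sketch:

#include <arm_neon.h>
#include <stdint.h>

/* Horizontal sum of 8 bytes, as in dc_sum_8() above. */
static uint16_t hsum_u8x8_sketch(const uint8_t *p) {
  const uint8x8_t v = vld1_u8(p);
  uint16x4_t s = vpaddl_u8(v); /* 8 bytes -> 4 widened pair sums */
  s = vpadd_u16(s, s);         /* 4 -> 2 */
  s = vpadd_u16(s, s);         /* 2 -> 1, total in every lane */
  return vget_lane_u16(s, 0);
}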
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { + const uint8x16_t ref_u8 = vld1q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(ref_u8); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } +static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u8(dst, dc_dup); } } void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); + const uint8x16_t ref0 = vld1q_u8(above); + const uint8x16_t ref1 = vld1q_u8(left); + const uint16x8_t p0 = vpaddlq_u8(ref0); + const uint16x8_t p1 = vpaddlq_u8(ref1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); + dc_store_16x16(dst, stride, dc); } void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); + dc_store_16x16(dst, stride, dc); } void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); + dc_store_16x16(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 32x32 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } - - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } +static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { + const uint8x16x2_t r = vld2q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(r.val[0]); + const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + uint8x16x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } + for (i = 0; i < 32; ++i, dst += stride) { + vst2q_u8(dst, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); + const uint8x16x2_t a = vld2q_u8(above); + const uint8x16x2_t l = vld2q_u8(left); + const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); + const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); + const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); + const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal = vaddq_u16(pa, pl); + uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); + dc_store_32x32(dst, stride, dc); } void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)left; - dc_32x32(dst, stride, 
above, NULL, 1, 0); + dc_store_32x32(dst, stride, dc); } void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); + dc_store_32x32(dst, stride, dc); } // ----------------------------------------------------------------------------- void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t ABCDEFGH = vld1_u8(above); + const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); + const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); @@ -331,485 +279,506 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; + vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t above_right, uint8x8_t *row) { + *row = vext_u8(*row, above_right, 1); + vst1_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t A0 = vld1_u8(above); + const uint8x8_t above_right = vdup_lane_u8(A0, 7); + const uint8x8_t A1 = vext_u8(A0, above_right, 1); + const uint8x8_t A2 = vext_u8(A0, above_right, 2); const uint8x8_t avg1 = vhadd_u8(A0, A2); uint8x8_t row = vrhadd_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); + + vst1_u8(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1_u8(dst, above_right); +} + +static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t above_right, uint8x16_t *row) { + *row = vextq_u8(*row, above_right, 1); + vst1q_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A0 = vld1q_u8(above); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); const uint8x16_t A1 = vextq_u8(A0, above_right, 1); const uint8x16_t A2 = vextq_u8(A0, above_right, 2); const uint8x16_t avg1 
= vhaddq_u8(A0, A2); uint8x16_t row = vrhaddq_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); - } - vst1q_u8(dst + i * stride, row); + + vst1q_u8(dst, row); + dst += stride; + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + vst1q_u8(dst, above_right); } // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint8x8_t XABCD = vld1_u8(above - 1); const uint32x2_t zero = vdup_n_u32(0); const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL)); + const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4); + const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5); + const uint8x8_t JIXABCD0 = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8)); + const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + vst1_lane_u32((uint32_t *)dst, r0, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r1, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r2, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r3, 0); } +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint32_t d = *(const uint32_t *)above; int i; - uint32x2_t d0u32 = vdup_n_u32(0); (void)left; - 
d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) { + *(uint32_t *)dst = d; + } } void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t d = vld1_u8(above); int i; - uint8x8_t d0u8 = vdup_n_u8(0); (void)left; - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) { + vst1_u8(dst, d); + } } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vld1q_u8(above); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) { + vst1q_u8(dst, d); + } } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + for (i = 0; i < 32; i++) { + // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8. + // clang-3.8 unrolled the loop fully with no filler so the cause is likely + // the latency of the instruction. + vst1q_u8(dst, d0); + dst += 16; + vst1q_u8(dst, d1); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); + const uint32x2_t zero = vdup_n_u32(0); + const uint8x8_t left_u8 = + vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0)); + uint8x8_t d; (void)above; - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); + const uint8x8_t left_u8 = vld1_u8(left); + uint8x8_t d; (void)above; - d1u64 = vld1_u64((const uint64_t *)left); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 0); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, 
d0u8); + d = vdup_lane_u8(left_u8, 1); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 2); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 3); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 4); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 5); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 6); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 7); + vst1_u8(dst, d); } void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + const uint8x16_t left_u8q = vld1q_u8(left); + uint8x8_t left_u8d = vget_low_u8(left_u8q); + uint8x16_t d; + int i; (void)above; - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); + for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) { + d = vdupq_lane_u8(left_u8d, 0); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 1); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 2); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 3); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 4); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 5); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 6); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 7); + vst1q_u8(dst, d); dst += stride; } } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + uint8x16_t d; + int i; (void)above; - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; 
- q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } + for (i = 0; i < 2; i++, left += 16) { + const uint8x16_t left_u8 = vld1q_u8(left); + const uint8x8_t left_low = vget_low_u8(left_u8); + const uint8x8_t left_high = vget_high_u8(left_u8); + d = vdupq_lane_u8(left_low, 0); + vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8 + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 1); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 2); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 3); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 4); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 5); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 6); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 7); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + + d = vdupq_lane_u8(left_high, 0); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 1); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 2); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 3); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 4); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 5); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 6); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 7); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + +static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); +} + void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x8_t above_u8 = vld1_u8(above); + const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8)); + int16x8_t sub, sum; + uint32x2_t d; + + sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8. 
+ sub = vreinterpretq_s16_s64( + vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0)); + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); +} + +static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub) { + const int16x8_t sum = vaddq_s16(left_dup, sub); + const uint8x8_t d = vqmovun_s16(sum); + vst1_u8(*dst, d); + *dst += stride; } void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + int16x4_t left_s16d = vget_low_s16(left_s16q); + int i; + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + int16x8_t left_dup; + + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub); } } +static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + vst1_u8(*dst, d0); + *dst += 8; + vst1_u8(*dst, d1); + *dst += stride - 8; +} + void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t 
q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_u8 = vld1q_u8(above); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left))); + int16x8_t left_dup; + int i; + + for (i = 0; i < 2; i++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x4_t left_low = vget_low_s16(left_s16q); + const int16x4_t left_high = vget_high_s16(left_s16q); + + left_dup = vdupq_lane_s16(left_low, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + + left_dup = vdupq_lane_s16(left_high, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); } } +static INLINE void tm_32_kernel(uint8_t **dst, const 
+                                const int16x8_t left_dup, const int16x8_t sub0,
+                                const int16x8_t sub1, const int16x8_t sub2,
+                                const int16x8_t sub3) {
+  const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+  const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+  const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+  const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+  const uint8x8_t d0 = vqmovun_s16(sum0);
+  const uint8x8_t d1 = vqmovun_s16(sum1);
+  const uint8x8_t d2 = vqmovun_s16(sum2);
+  const uint8x8_t d3 = vqmovun_s16(sum3);
+
+  vst1q_u8(*dst, vcombine_u8(d0, d1));
+  *dst += 16;
+  vst1q_u8(*dst, vcombine_u8(d2, d3));
+  *dst += stride - 16;
+}
+
 void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
-  uint8x16_t q0u8, q1u8, q2u8;
-  int16x8_t q12s16, q13s16, q14s16, q15s16;
-  uint16x4_t d6u16;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
-  q0u8 = vld1q_dup_u8(above - 1);
-  q1u8 = vld1q_u8(above);
-  q2u8 = vld1q_u8(above + 16);
-  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
-  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
-  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
-  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
-  for (k = 0; k < 4; k++, left += 8) {
-    d26u8 = vld1_u8(left);
-    q3u16 = vmovl_u8(d26u8);
-    d6u16 = vget_low_u16(q3u16);
-    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
-      q0u16 = vdupq_lane_u16(d6u16, 0);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 1);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 2);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
-
-      q0u16 = vdupq_lane_u16(d6u16, 3);
-      q12s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16));
-      q13s16 =
-          vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16));
-      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q10u16));
-      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                         vreinterpretq_s16_u16(q11u16));
-      d0u8 = vqmovun_s16(q12s16);
-      d1u8 = vqmovun_s16(q13s16);
-      d2u8 = vqmovun_s16(q14s16);
-      d3u8 = vqmovun_s16(q15s16);
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-      dst += stride;
+  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
+  const uint8x16_t above_low = vld1q_u8(above);
+  const uint8x16_t above_high = vld1q_u8(above + 16);
+  const int16x8_t sub0 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
+  const int16x8_t sub1 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
+  const int16x8_t sub2 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
+  const int16x8_t sub3 = vreinterpretq_s16_u16(
+      vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
+  int16x8_t left_dup;
+  int i, j;
+
+  for (j = 0; j < 4; j++, left += 8) {
+    const uint8x8_t left_u8 = vld1_u8(left);
+    const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
+    int16x4_t left_s16d = vget_low_s16(left_s16q);
+    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+      left_dup = vdupq_lane_s16(left_s16d, 0);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+      left_dup = vdupq_lane_s16(left_s16d, 1);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+      left_dup = vdupq_lane_s16(left_s16d, 2);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
+      left_dup = vdupq_lane_s16(left_s16d, 3);
+      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
     }
   }
 }
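Note: tm_16_kernel/tm_32_kernel above implement VP9's TM (true-motion) intra
predictor, where each output pixel is left[r] + above[c] - above[-1],
saturated to [0, 255]. A minimal scalar sketch of the same computation
(tm_predictor_ref is an illustrative name, not part of this patch):

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for the TM predictor. The NEON kernels do the same math
 * vectorized: vsubl_u8 forms (above - top_left) as widened int16 lanes,
 * vaddq_s16 adds the duplicated left pixel, and vqmovun_s16 saturates the
 * sums back to [0, 255]. */
static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int size,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst += stride;
  }
}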
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index dad34e4b4..8c91b141f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -204,18 +204,15 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
 else
 ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/idct4x4_add_neon.c
-DSP_SRCS-yes += arm/idct8x8_add_neon.c
 DSP_SRCS-yes += arm/idct16x16_add_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
 
 DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
@@ -232,14 +229,20 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct_neon$(ASM)
 DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
 DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
 else
 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
 endif  # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
 endif  # CONFIG_VP9
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fa6294142..7f31a6a11 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -644,16 +644,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 } else {
   add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct4x4_16_add sse2/;
+  specialize qw/vpx_idct4x4_16_add neon sse2/;
 
   add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct4x4_1_add neon sse2/;
 
   add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
+  specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
+  specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct8x8_1_add neon sse2/;
@@ -764,8 +764,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
-  # Need to add 34 eob idct32x32 neon implementation.
-  $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon;
 
   add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
 
diff --git a/vpxdec.c b/vpxdec.c
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -781,7 +781,7 @@ static int main_loop(int argc, const char **argv_) {
         warn("Failed to decode frame %d: %s", frame_in,
              vpx_codec_error(&decoder));
         if (detail) warn("Additional information: %s", detail);
-        frames_corrupted++;
+        corrupted = 1;
         if (!keep_going) goto fail;
       }
 
@@ -800,7 +800,7 @@ static int main_loop(int argc, const char **argv_) {
       // Flush the decoder in frame parallel decode.
       if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
         warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
-        frames_corrupted++;
+        corrupted = 1;
         if (!keep_going) goto fail;
       }
     }
@@ -814,7 +814,7 @@ static int main_loop(int argc, const char **argv_) {
     vpx_usec_timer_mark(&timer);
     dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
 
-    if (!frame_parallel &&
+    if (!frame_parallel && !corrupted &&
         vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
       warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
       if (!keep_going) goto fail;
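Note: in the vpxdec.c hunks above, a decode or flush failure now sets the
corrupted flag instead of bumping frames_corrupted directly, and the added
!corrupted guard skips the VP8D_GET_FRAME_CORRUPTED query once the flag is
already set, so a bad frame is counted at most once. A toy model of that
control flow, assuming frames_corrupted is accumulated from the flag in
surrounding code not shown in these hunks (all names here are illustrative,
not libvpx API):

#include <stdio.h>

/* Stand-in for the VP8D_GET_FRAME_CORRUPTED control call. */
static int query_decoder_corrupted(void) { return 0; /* pretend clean */ }

int main(void) {
  int frames_corrupted = 0;
  int frame;
  for (frame = 0; frame < 3; ++frame) {
    int corrupted = 0;
    const int decode_failed = (frame == 1); /* simulate one failed decode */
    if (decode_failed) corrupted = 1;       /* was: frames_corrupted++ */
    /* Query the decoder only when no failure was recorded for this frame. */
    if (!corrupted) corrupted = query_decoder_corrupted();
    frames_corrupted += corrupted;          /* each frame counted once */
  }
  printf("%d frame(s) corrupted\n", frames_corrupted); /* prints 1 */
  return 0;
}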