-rw-r--r--  test/dct_test.cc                               | 102
-rw-r--r--  test/encode_api_test.cc                        |  72
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                    |   9
-rw-r--r--  vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c  | 419
-rw-r--r--  vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c    | 255
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c          |   8
-rw-r--r--  vp9/vp9_common.mk                              |   2
-rw-r--r--  vpx/src/vpx_encoder.c                          |  50
-rw-r--r--  vpx_dsp/x86/highbd_idct16x16_add_sse4.c        |   6
-rw-r--r--  vpx_dsp/x86/highbd_idct8x8_add_sse2.c          |   4
-rw-r--r--  vpx_dsp/x86/highbd_idct8x8_add_sse4.c          |  14
-rw-r--r--  vpx_dsp/x86/highbd_inv_txfm_sse4.h             |   3
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c                    |  10
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.h                    |   3
14 files changed, 898 insertions(+), 59 deletions(-)
diff --git a/test/dct_test.cc b/test/dct_test.cc
index a5ac9a0dc..379fbecc0 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -597,7 +597,9 @@ class TransHT : public TransTestBase {
TransHT() { fwd_txfm_ref = fht_ref; }
};
-TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); }
+TEST_P(TransHT, AccuracyCheck) {
+ RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 2 : 1);
+}
TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
@@ -605,17 +607,6 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); }
TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-/* TODO:(johannkoenig) Determine why these fail AccuracyCheck
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2),
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2),
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2),
- make_tuple(&vp9_highbd_fht16x16_c,
- &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2),
- */
-
const DctParam c_ht_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
make_tuple(&vp9_highbd_fht16x16_c,
@@ -642,6 +633,19 @@ const DctParam c_ht_tests[] = {
make_tuple(&vp9_highbd_fht16x16_c,
&highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
+ VPX_BITS_12, 2),
+
make_tuple(&vp9_highbd_fht8x8_c,
&highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8,
2),
@@ -784,6 +788,80 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
SSE4_1, TransHT,
::testing::Values(
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 0, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 1, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 2, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 3, VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 0, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 1, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 2, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 3, VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 0, VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 1, VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 2, VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht16x16_c,
+ &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+ 3, VPX_BITS_12, 2),
+
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3,
+ VPX_BITS_8, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3,
+ VPX_BITS_10, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2,
+ VPX_BITS_12, 2),
+ make_tuple(&vp9_highbd_fht8x8_c,
+ &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3,
+ VPX_BITS_12, 2),
+
make_tuple(&vp9_highbd_fht4x4_c,
&highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>, 4, 0,
VPX_BITS_8, 2),
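
The AccuracyCheck change above widens the allowed error from 1 to 2 for 16x16 transforms at bit depths above 10, which is what lets the VPX_BITS_12 16x16 vectors, previously commented out with a TODO, run again. A minimal sketch of what such a tolerance check amounts to (hypothetical helper, not the actual RunAccuracyCheck in test/dct_test.cc):

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in for the harness: after a forward+inverse
 * transform round trip, require every reconstructed sample to stay
 * within `limit` of the source. */
static int round_trip_within_limit(const uint16_t *src, const uint16_t *recon,
                                   int num_samples, int limit) {
  int i;
  for (i = 0; i < num_samples; ++i) {
    if (abs((int)recon[i] - (int)src[i]) > limit) return 0; /* over limit */
  }
  return 1; /* all samples within tolerance */
}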
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 164db5a7b..13de53464 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -106,4 +106,76 @@ TEST(EncodeAPI, ImageSizeSetting) {
}
#endif
+#if CONFIG_MULTI_RES_ENCODING
+// Set up 2 spatial streams with 2 temporal layers per stream, and generate
+// an invalid configuration by setting the temporal layer rate allocation
+// (ts_target_bitrate[]) to 0 for both layers.
+TEST(EncodeAPI, VP8MultiResEncode) {
+ const int width = 1280;
+ const int height = 720;
+ const int width_down = width / 2;
+ const int height_down = height / 2;
+ const int target_bitrate = 1000;
+ const int framerate = 30;
+ vpx_codec_ctx_t enc[2];
+ vpx_codec_enc_cfg_t cfg[2];
+ vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
+
+ memset(enc, 0, sizeof(enc));
+
+ for (int i = 0; i < 2; i++) {
+ vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg[i], 0);
+ }
+
+ /* Highest-resolution encoder settings */
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].rc_dropframe_thresh = 0;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 2;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 100;
+ cfg[0].rc_overshoot_pct = 15;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+ cfg[0].g_lag_in_frames = 0;
+
+ cfg[0].kf_mode = VPX_KF_AUTO;
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+
+ cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */
+ cfg[0].g_timebase.num = 1; /* Set fps */
+ cfg[0].g_timebase.den = framerate;
+
+ memcpy(&cfg[1], &cfg[0], sizeof(cfg[0]));
+ cfg[1].rc_target_bitrate = 500;
+ cfg[1].g_w = width_down;
+ cfg[1].g_h = height_down;
+
+ for (int i = 0; i < 2; i++) {
+ cfg[i].ts_number_layers = 2;
+ cfg[i].ts_periodicity = 2;
+ cfg[i].ts_rate_decimator[0] = 2;
+ cfg[i].ts_rate_decimator[1] = 1;
+ cfg[i].ts_layer_id[0] = 0;
+ cfg[i].ts_layer_id[1] = 1;
+ // Invalid parameters.
+ cfg[i].ts_target_bitrate[0] = 0;
+ cfg[i].ts_target_bitrate[1] = 0;
+ }
+
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
+ vpx_codec_enc_init_multi(&enc[0], vpx_codec_vp8_cx(), &cfg[0], 2, 0,
+ &dsf[0]));
+
+ for (int i = 0; i < 2; i++) {
+ vpx_codec_destroy(&enc[i]);
+ }
+}
+#endif
+
} // namespace
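
For contrast with the zeroed rates the test feeds in, a configuration the validator should accept gives each temporal layer a nonzero, nondecreasing target. A sketch continuing from the test's cfg[] above (rates illustrative; in libvpx's temporal-layer examples ts_target_bitrate[] is cumulative, each layer's value including the layers below it):

  int i;
  for (i = 0; i < 2; i++) {
    cfg[i].ts_target_bitrate[0] = cfg[i].rc_target_bitrate / 2; /* base */
    cfg[i].ts_target_bitrate[1] = cfg[i].rc_target_bitrate; /* base + enh */
  }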
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index dd6120266..7ee7ddaee 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -97,13 +97,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
- }
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
+
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
+ specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+ specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
+ }
}
#
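For reference, a sketch of what a specialize line expands to in the generated vp9_rtcd.h (illustrative, not the literal generator output): without a specialization the prototype collapses to a #define onto the _c version; with one, the symbol becomes a function pointer that setup_rtcd_internal() retargets after CPU detection.

RTCD_EXTERN void (*vp9_highbd_iht8x8_64_add)(const tran_low_t *input,
                                             uint16_t *dest, int stride,
                                             int tx_type, int bd);
/* ...and in setup_rtcd_internal(): */
vp9_highbd_iht8x8_64_add = vp9_highbd_iht8x8_64_add_c;
if (flags & HAS_SSE4_1)
  vp9_highbd_iht8x8_64_add = vp9_highbd_iht8x8_64_add_sse4_1;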
diff --git a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
new file mode 100644
index 000000000..57b79a732
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+ const int c,
+ __m128i *const s) {
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);
+ s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+ const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const s0,
+ __m128i *const s1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+ s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+ x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+
+ // stage 1
+ highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+ highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+ s15);
+
+ x0[0] = _mm_add_epi64(s0[0], s8[0]);
+ x0[1] = _mm_add_epi64(s0[1], s8[1]);
+ x1[0] = _mm_add_epi64(s1[0], s9[0]);
+ x1[1] = _mm_add_epi64(s1[1], s9[1]);
+ x2[0] = _mm_add_epi64(s2[0], s10[0]);
+ x2[1] = _mm_add_epi64(s2[1], s10[1]);
+ x3[0] = _mm_add_epi64(s3[0], s11[0]);
+ x3[1] = _mm_add_epi64(s3[1], s11[1]);
+ x4[0] = _mm_add_epi64(s4[0], s12[0]);
+ x4[1] = _mm_add_epi64(s4[1], s12[1]);
+ x5[0] = _mm_add_epi64(s5[0], s13[0]);
+ x5[1] = _mm_add_epi64(s5[1], s13[1]);
+ x6[0] = _mm_add_epi64(s6[0], s14[0]);
+ x6[1] = _mm_add_epi64(s6[1], s14[1]);
+ x7[0] = _mm_add_epi64(s7[0], s15[0]);
+ x7[1] = _mm_add_epi64(s7[1], s15[1]);
+ x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+ x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+ x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+ x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+ x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+ x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+ x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+ x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+ x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x0[0] = pack_4(x0[0], x0[1]);
+ x1[0] = pack_4(x1[0], x1[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 2
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ s4[0] = x4[0];
+ s5[0] = x5[0];
+ s6[0] = x6[0];
+ s7[0] = x7[0];
+ x0[0] = _mm_add_epi32(s0[0], s4[0]);
+ x1[0] = _mm_add_epi32(s1[0], s5[0]);
+ x2[0] = _mm_add_epi32(s2[0], s6[0]);
+ x3[0] = _mm_add_epi32(s3[0], s7[0]);
+ x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+ x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+ x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+ x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+
+ highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+ s12);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+ s14);
+
+ x8[0] = _mm_add_epi64(s8[0], s12[0]);
+ x8[1] = _mm_add_epi64(s8[1], s12[1]);
+ x9[0] = _mm_add_epi64(s9[0], s13[0]);
+ x9[1] = _mm_add_epi64(s9[1], s13[1]);
+ x10[0] = _mm_add_epi64(s10[0], s14[0]);
+ x10[1] = _mm_add_epi64(s10[1], s14[1]);
+ x11[0] = _mm_add_epi64(s11[0], s15[0]);
+ x11[1] = _mm_add_epi64(s11[1], s15[1]);
+ x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 3
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+ s8[0] = x8[0];
+ s9[0] = x9[0];
+ s10[0] = x10[0];
+ s11[0] = x11[0];
+ highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+ s14);
+
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = _mm_add_epi32(s8[0], s10[0]);
+ x9[0] = _mm_add_epi32(s9[0], s11[0]);
+ x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+ x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+ x12[0] = _mm_add_epi64(s12[0], s14[0]);
+ x12[1] = _mm_add_epi64(s12[1], s14[1]);
+ x13[0] = _mm_add_epi64(s13[0], s15[0]);
+ x13[1] = _mm_add_epi64(s13[1], s15[1]);
+ x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 4
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x7[0], x6[0]);
+ s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+ s10[0] = _mm_add_epi32(x11[0], x10[0]);
+ s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+ s14[0] = _mm_add_epi32(x14[0], x15[0]);
+ s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+ highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+ highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+ highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+ highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x10[0] = dct_const_round_shift_64bit(s10[0]);
+ x10[1] = dct_const_round_shift_64bit(s10[1]);
+ x11[0] = dct_const_round_shift_64bit(s11[0]);
+ x11[1] = dct_const_round_shift_64bit(s11[1]);
+ x14[0] = dct_const_round_shift_64bit(s14[0]);
+ x14[1] = dct_const_round_shift_64bit(s14[1]);
+ x15[0] = dct_const_round_shift_64bit(s15[0]);
+ x15[1] = dct_const_round_shift_64bit(s15[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+ io[2] = x12[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[4] = x6[0];
+ io[5] = x14[0];
+ io[6] = x10[0];
+ io[7] = x2[0];
+ io[8] = x3[0];
+ io[9] = x11[0];
+ io[10] = x15[0];
+ io[11] = x7[0];
+ io[12] = x5[0];
+ io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+ io[14] = x9[0];
+ io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ idct16_8col(in, in);
+ } else {
+ vpx_iadst16_8col_sse2(in);
+ }
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ idct16_8col(out, out);
+ } else {
+ vpx_iadst16_8col_sse2(out);
+ }
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct16_4col_sse4_1(in);
+ } else {
+ highbd_iadst16_4col_sse4_1(in);
+ }
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct16_4col_sse4_1(out);
+ } else {
+ highbd_iadst16_4col_sse4_1(out);
+ }
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
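
A scalar model of the two butterfly helpers at the top of this file may help when reading the SIMD: each butterfly forms in0*c0 + in1*c1 and in0*c1 - in1*c0 in 64-bit precision, and dct_const_round_shift_64bit later rounds by DCT_CONST_BITS. The 4*c pre-scaling in pair_set_epi32 appears to exist so the vector rounding shift can work in whole bytes (16 bits rather than 14). A sketch of the arithmetic being modeled (helper names here are illustrative):

#include <stdint.h>

#define DCT_CONST_BITS 14 /* as in vpx_dsp/txfm_common.h */

/* Scalar counterpart of dct_const_round_shift_64bit(). */
static int64_t round_shift64(int64_t x) {
  return (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

/* Scalar equivalent (sketch) of highbd_iadst_butterfly_sse4_1(). */
static void iadst_butterfly(int32_t in0, int32_t in1, int c0, int c1,
                            int64_t *s0, int64_t *s1) {
  *s0 = (int64_t)in0 * c0 + (int64_t)in1 * c1; /* t00 + t11 */
  *s1 = (int64_t)in0 * c1 - (int64_t)in1 * c0; /* t10 - t01 */
}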
diff --git a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
new file mode 100644
index 000000000..7d949b6db
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+ const int c,
+ __m128i *const s) {
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);
+ s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+ const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const s0,
+ __m128i *const s1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst8_sse4_1(__m128i *const io) {
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
+ x0[0] = _mm_add_epi64(s0[0], s4[0]);
+ x0[1] = _mm_add_epi64(s0[1], s4[1]);
+ x1[0] = _mm_add_epi64(s1[0], s5[0]);
+ x1[1] = _mm_add_epi64(s1[1], s5[1]);
+ x4[0] = _mm_sub_epi64(s0[0], s4[0]);
+ x4[1] = _mm_sub_epi64(s0[1], s4[1]);
+ x5[0] = _mm_sub_epi64(s1[0], s5[0]);
+ x5[1] = _mm_sub_epi64(s1[1], s5[1]);
+
+ highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
+ x2[0] = _mm_add_epi64(s2[0], s6[0]);
+ x2[1] = _mm_add_epi64(s2[1], s6[1]);
+ x3[0] = _mm_add_epi64(s3[0], s7[0]);
+ x3[1] = _mm_add_epi64(s3[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s2[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s2[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s3[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s3[1], s7[1]);
+
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ s0[0] = pack_4(x0[0], x0[1]); // s0 = x0;
+ s1[0] = pack_4(x1[0], x1[1]); // s1 = x1;
+ s2[0] = pack_4(x2[0], x2[1]); // s2 = x2;
+ s3[0] = pack_4(x3[0], x3[1]); // s3 = x3;
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 2
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 3
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x6[0], x7[0]);
+ s7[0] = _mm_sub_epi32(x6[0], x7[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[2] = x6[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
+ io[4] = x3[0];
+ io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
+ io[6] = x5[0];
+ io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+ highbd_idct8x8_final_round(io);
+ }
+ recon_and_store_8x8(io, dest, stride, bd);
+}
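
Both new files select the 1-D transform for each pass with the same pair of conditions; per tx_type the (first pass, second pass) types are DCT_DCT: (idct, idct), ADST_DCT: (idct, iadst), DCT_ADST: (iadst, idct), ADST_ADST: (iadst, iadst). A sketch of the predicates (tx_type values 0-3 as in vp9):

/* tx_type values as used by vp9: */
enum { kDctDct = 0, kAdstDct = 1, kDctAdst = 2, kAdstAdst = 3 };

static int first_pass_is_idct(int tx_type) {
  return tx_type == kDctDct || tx_type == kAdstDct;
}
static int second_pass_is_idct(int tx_type) {
  return tx_type == kDctDct || tx_type == kDctAdst;
}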
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 6996260e2..95dad919e 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -68,16 +68,16 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
switch (tx_type) {
case 0: // DCT_DCT
- idct8_sse2(in);
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
+ vpx_idct8_sse2(in);
break;
case 1: // ADST_DCT
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
iadst8_sse2(in);
break;
case 2: // DCT_ADST
iadst8_sse2(in);
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
break;
case 3: // ADST_ADST
iadst8_sse2(in);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 9819fb641..377411431 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -82,6 +82,8 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
else
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c
endif
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 4390cf7c8..a26204bc4 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -12,8 +12,11 @@
* \brief Provides the high level interface to wrap encoder algorithms.
*
*/
+#include <assert.h>
#include <limits.h>
+#include <stdlib.h>
#include <string.h>
+#include "vp8/common/blockd.h"
#include "vpx_config.h"
#include "vpx/internal/vpx_codec_internal.h"
@@ -89,28 +92,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
dsf->den > dsf->num) {
res = VPX_CODEC_INVALID_PARAM;
- break;
+ } else {
+ mr_cfg.mr_low_res_mode_info = mem_loc;
+ mr_cfg.mr_total_resolutions = num_enc;
+ mr_cfg.mr_encoder_id = num_enc - 1 - i;
+ mr_cfg.mr_down_sampling_factor.num = dsf->num;
+ mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+ /* Force key-frame synchronization. Namely, encoders at higher
+ * resolutions always use the same frame_type chosen by the
+ * lowest-resolution encoder.
+ */
+ if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED;
+
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.enc = cfg;
+ res = ctx->iface->init(ctx, &mr_cfg);
}
- mr_cfg.mr_low_res_mode_info = mem_loc;
- mr_cfg.mr_total_resolutions = num_enc;
- mr_cfg.mr_encoder_id = num_enc - 1 - i;
- mr_cfg.mr_down_sampling_factor.num = dsf->num;
- mr_cfg.mr_down_sampling_factor.den = dsf->den;
-
- /* Force Key-frame synchronization. Namely, encoder at higher
- * resolution always use the same frame_type chosen by the
- * lowest-resolution encoder.
- */
- if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED;
-
- ctx->iface = iface;
- ctx->name = iface->name;
- ctx->priv = NULL;
- ctx->init_flags = flags;
- ctx->config.enc = cfg;
- res = ctx->iface->init(ctx, &mr_cfg);
-
if (res) {
const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
/* Destroy current ctx */
@@ -124,10 +126,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(
vpx_codec_destroy(ctx);
i--;
}
+#if CONFIG_MULTI_RES_ENCODING
+ assert(mem_loc);
+ free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info);
+ free(mem_loc);
+#endif
+ return SAVE_STATUS(ctx, res);
}
- if (res) break;
-
ctx++;
cfg++;
dsf++;
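
With the restructured error path above, a failing vpx_codec_enc_init_multi() frees the shared low-res mode info itself before returning, so a caller only has to destroy its contexts, exactly as the new test does. A caller-side sketch (hypothetical wrapper):

#include <string.h>
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Hypothetical helper mirroring the pattern in the new test. */
static vpx_codec_err_t init_two_stream_multi_res(vpx_codec_ctx_t enc[2],
                                                 vpx_codec_enc_cfg_t cfg[2],
                                                 vpx_rational_t dsf[2]) {
  vpx_codec_err_t res;
  int i;
  memset(enc, 0, 2 * sizeof(*enc));
  res = vpx_codec_enc_init_multi(&enc[0], vpx_codec_vp8_cx(), &cfg[0], 2, 0,
                                 &dsf[0]);
  if (res != VPX_CODEC_OK) {
    /* Safe even for contexts whose init never completed. */
    for (i = 0; i < 2; i++) vpx_codec_destroy(&enc[i]);
  }
  return res;
}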
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
index de097c66a..7898ee12c 100644
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
out[15] = in[15];
}
-static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
// stage 2
@@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
in = all[i];
highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
- highbd_idct16_4col(in);
+ vpx_highbd_idct16_4col_sse4_1(in);
input += 4 * 16;
}
@@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
transpose_32bit_4x4(all[1] + i, out + 4);
transpose_32bit_4x4(all[2] + i, out + 8);
transpose_32bit_4x4(all[3] + i, out + 12);
- highbd_idct16_4col(out);
+ vpx_highbd_idct16_4col_sse4_1(out);
for (j = 0; j < 16; ++j) {
highbd_write_buffer_4(dest + j * stride, out[j], bd);
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
index 909a6b794..bb7a510e1 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest,
io_short[6] = _mm_packs_epi32(io[10], io[14]);
io_short[7] = _mm_packs_epi32(io[11], io[15]);
- idct8_sse2(io_short);
- idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
round_shift_8x8(io_short, io);
} else {
__m128i temp[4];
diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
index ae391b2c0..8b2e3d241 100644
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -17,7 +17,7 @@
#include "vpx_dsp/x86/inv_txfm_ssse3.h"
#include "vpx_dsp/x86/transpose_sse2.h"
-static void highbd_idct8x8_half1d(__m128i *const io) {
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) {
__m128i step1[8], step2[8];
transpose_32bit_4x4x2(io, io);
@@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
io_short[6] = _mm_packs_epi32(io[10], io[14]);
io_short[7] = _mm_packs_epi32(io[11], io[15]);
- idct8_sse2(io_short);
- idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
+ vpx_idct8_sse2(io_short);
round_shift_8x8(io_short, io);
} else {
__m128i temp[4];
- highbd_idct8x8_half1d(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
@@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
- highbd_idct8x8_half1d(&io[8]);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
temp[0] = io[4];
temp[1] = io[5];
@@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
io[5] = io[9];
io[6] = io[10];
io[7] = io[11];
- highbd_idct8x8_half1d(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
io[8] = temp[0];
io[9] = temp[1];
io[10] = temp[2];
io[11] = temp[3];
- highbd_idct8x8_half1d(&io[8]);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
highbd_idct8x8_final_round(io);
}
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index 435934f1b..5a7fd1d39 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -106,4 +106,7 @@ static INLINE void highbd_idct4_sse4_1(__m128i *const io) {
io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
}
+void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
+
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 6b1837df5..4b02da966 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -165,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
// 2-D
for (i = 0; i < 2; i++) {
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
}
write_buffer_8x8(in, dest, stride);
@@ -221,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
recon_and_store_8_dual(dest, dc_value, stride);
}
-void idct8_sse2(__m128i *const in) {
+void vpx_idct8_sse2(__m128i *const in) {
// 8x8 Transpose is copied from vpx_fdct8x8_sse2()
transpose_16bit_8x8(in, in);
@@ -514,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
}
}
-static void iadst16_8col(__m128i *const in) {
+void vpx_iadst16_8col_sse2(__m128i *const in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -874,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) {
void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
transpose_16bit_16x16(in0, in1);
- iadst16_8col(in0);
- iadst16_8col(in1);
+ vpx_iadst16_8col_sse2(in0);
+ vpx_iadst16_8col_sse2(in1);
}
// Group the coefficient calculation into smaller functions to prevent stack
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index 5cd5098f1..d573f66c9 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -697,10 +697,11 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7(
}
void idct4_sse2(__m128i *const in);
-void idct8_sse2(__m128i *const in);
+void vpx_idct8_sse2(__m128i *const in);
void idct16_sse2(__m128i *const in0, __m128i *const in1);
void iadst4_sse2(__m128i *const in);
void iadst8_sse2(__m128i *const in);
+void vpx_iadst16_8col_sse2(__m128i *const in);
void iadst16_sse2(__m128i *const in0, __m128i *const in1);
void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);