5 files changed, 518 insertions, 5 deletions
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 9c409e9a7..8764a78de 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -765,6 +765,36 @@ INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 8, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 10, 2),
+  make_tuple(&vpx_highbd_fdct16x16_c,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>,
+             &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16,
+             256, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 12, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 8, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 10, 2),
+  make_tuple(
+      &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
+      &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 12, 2),
   make_tuple(
       &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
       &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2),
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ae98eb23d..705b3a610 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -246,6 +246,7 @@ DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 99ef262b1..1dc3b1155 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -656,9 +656,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
-    specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
-    specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
-    specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
+    specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct32x32_1024_add neon/;
     specialize qw/vpx_highbd_idct32x32_135_add neon/;
     specialize qw/vpx_highbd_idct32x32_34_add neon/;
diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
new file mode 100644
index 000000000..f25d8e5ee
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -0,0 +1,471 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
+                                             __m128i *const out) {
+  __m128i temp1[2], temp2;  // scratch for the two cospi_16_64 products
+  // stage 5: out[4] and out[7] are not written; callers here set them first
+  out[0] = _mm_add_epi32(in[0], in[3]);
+  out[1] = _mm_add_epi32(in[1], in[2]);
+  out[2] = _mm_sub_epi32(in[1], in[2]);
+  out[3] = _mm_sub_epi32(in[0], in[3]);
+  temp2 = _mm_sub_epi32(in[6], in[5]);  // difference feeds out[5]
+  extend_64bit(temp2, temp1);
+  out[5] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_add_epi32(in[6], in[5]);  // sum feeds out[6]
+  extend_64bit(temp2, temp1);
+  out[6] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  out[8] = _mm_add_epi32(in[8], in[11]);  // 8..15: plain 32-bit add/sub
+  out[9] = _mm_add_epi32(in[9], in[10]);
+  out[10] = _mm_sub_epi32(in[9], in[10]);
+  out[11] = _mm_sub_epi32(in[8], in[11]);
+  out[12] = _mm_sub_epi32(in[15], in[12]);
+  out[13] = _mm_sub_epi32(in[14], in[13]);
+  out[14] = _mm_add_epi32(in[14], in[13]);
+  out[15] = _mm_add_epi32(in[15], in[12]);
+}
+
+static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
+                                             __m128i *const out) {
+  __m128i temp1[2], temp2;  // stage 6 scratch for the cospi_16_64 products
+  out[0] = _mm_add_epi32(in[0], in[7]);  // 0..7: in[k] +/- in[7 - k]
+  out[1] = _mm_add_epi32(in[1], in[6]);
+  out[2] = _mm_add_epi32(in[2], in[5]);
+  out[3] = _mm_add_epi32(in[3], in[4]);
+  out[4] = _mm_sub_epi32(in[3], in[4]);
+  out[5] = _mm_sub_epi32(in[2], in[5]);
+  out[6] = _mm_sub_epi32(in[1], in[6]);
+  out[7] = _mm_sub_epi32(in[0], in[7]);
+  out[8] = in[8];  // 8, 9, 14 and 15 are copied through unchanged
+  out[9] = in[9];
+  temp2 = _mm_sub_epi32(in[13], in[10]);  // (10, 13) pair: diff, then sum
+  extend_64bit(temp2, temp1);
+  out[10] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_add_epi32(in[13], in[10]);
+  extend_64bit(temp2, temp1);
+  out[13] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+
+  temp2 = _mm_sub_epi32(in[12], in[11]);  // (11, 12) pair: diff, then sum
+  extend_64bit(temp2, temp1);
+  out[11] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_add_epi32(in[12], in[11]);
+  extend_64bit(temp2, temp1);
+  out[12] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  out[14] = in[14];
+  out[15] = in[15];
+}
+
+static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp1[4], temp2;
+
+  // stage 2: the eight odd-indexed inputs produce step2[8..15]
+  highbd_multiplication_and_add_sse4_1(io[1], io[15], (int)cospi_30_64,
+                                       (int)cospi_2_64, &step2[8], &step2[15]);
+  highbd_multiplication_and_add_sse4_1(io[9], io[7], (int)cospi_14_64,
+                                       (int)cospi_18_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(io[5], io[11], (int)cospi_22_64,
+                                       (int)cospi_10_64, &step2[10],
+                                       &step2[13]);
+  highbd_multiplication_and_add_sse4_1(
+      io[13], io[3], (int)cospi_6_64, (int)cospi_26_64, &step2[11], &step2[12]);
+
+  // stage 3: io[2], io[14], io[10], io[6] produce step1[4..7]
+  highbd_multiplication_and_add_sse4_1(io[2], io[14], (int)cospi_28_64,
+                                       (int)cospi_4_64, &step1[4], &step1[7]);
+  highbd_multiplication_and_add_sse4_1(io[10], io[6], (int)cospi_12_64,
+                                       (int)cospi_20_64, &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4: outputs go to step2, except 4 and 7 which are updated in step1
+  temp2 = _mm_add_epi32(io[0], io[8]);
+  extend_64bit(temp2, temp1);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  temp2 = _mm_sub_epi32(io[0], io[8]);
+  extend_64bit(temp2, temp1);
+  step2[1] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  highbd_multiplication_and_add_sse4_1(io[4], io[12], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[2], &step2[3]);
+  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                                       (int)cospi_24_64, &step2[13],
+                                       &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);  // keeps step1[4], step1[7] as set
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);  // defined outside this file
+}
+
+static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp1[2];
+
+  // stage 2: this variant reads only io[0..7] (the odd indices here)
+  highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                               &step2[8], &step2[15]);
+  highbd_multiplication_sse4_1(io[7], -(int)cospi_18_64, (int)cospi_14_64,
+                               &step2[9], &step2[14]);
+  highbd_multiplication_sse4_1(io[5], (int)cospi_22_64, (int)cospi_10_64,
+                               &step2[10], &step2[13]);
+  highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
+                               &step2[11], &step2[12]);
+
+  // stage 3: io[2] and io[6] produce step1[4..7]
+  highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                               &step1[4], &step1[7]);
+  highbd_multiplication_sse4_1(io[6], -(int)cospi_20_64, (int)cospi_12_64,
+                               &step1[5], &step1[6]);
+  step1[8] = _mm_add_epi32(step2[8], step2[9]);
+  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
+  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
+  step1[11] = _mm_add_epi32(step2[10], step2[11]);
+  step1[12] = _mm_add_epi32(step2[13], step2[12]);
+  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]
+  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
+  step1[15] = _mm_add_epi32(step2[15], step2[14]);
+
+  // stage 4: step2[1] copies step2[0]; io[8] is not read in this variant
+  extend_64bit(io[0], temp1);
+  step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
+  step2[1] = step2[0];
+  highbd_multiplication_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64,
+                               &step2[2], &step2[3]);
+  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                                       (int)cospi_24_64, &step2[13],
+                                       &step2[10]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step1[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step1[7] = _mm_add_epi32(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);  // keeps step1[4], step1[7] as set
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);  // defined outside this file
+}
+
+static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
+  __m128i step1[16], step2[16];
+  __m128i temp[2];
+
+  // stage 2: this variant reads only io[0], io[1], io[2] and io[3]
+  highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                               &step2[8], &step2[15]);
+  highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
+                               &step2[11], &step2[12]);
+
+  // stage 3: as the 38 variant with step2[9], [10], [13], [14] taken as 0
+  highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                               &step1[4], &step1[7]);
+  step1[8] = step2[8];
+  step1[9] = step2[8];
+  step1[10] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[11]);  // step1[10] = -step1[10]
+  step1[11] = step2[11];
+  step1[12] = step2[12];
+  step1[13] =
+      _mm_sub_epi32(_mm_setzero_si128(), step2[12]);  // step1[13] = -step1[13]
+  step1[14] = step2[15];
+  step1[15] = step2[15];
+
+  // stage 4: step2[2], step2[3] are zero; step1[4], step1[7] stay in step1
+  extend_64bit(io[0], temp);
+  step2[0] = multiplication_round_shift_sse4_1(temp, (int)cospi_16_64);
+  step2[1] = step2[0];
+  step2[2] = _mm_setzero_si128();
+  step2[3] = _mm_setzero_si128();
+  highbd_multiplication_and_add_sse4_1(step1[14], step1[9], (int)cospi_24_64,
+                                       (int)cospi_8_64, &step2[9], &step2[14]);
+  highbd_multiplication_and_add_sse4_1(step1[10], step1[13], (int)cospi_8_64,
+                                       (int)cospi_24_64, &step2[13],
+                                       &step2[10]);
+  step2[5] = step1[4];
+  step2[6] = step1[7];
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  highbd_idct16_4col_stage5(step2, step1);  // keeps step1[4], step1[7] as set
+  highbd_idct16_4col_stage6(step1, step2);
+  highbd_idct16_4col_stage7(step2, io);  // defined outside this file
+}
+
+void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
+                                         uint16_t *dest, int stride, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {  // 16-bit helpers: transpose_16bit_8x8, idct16_8col
+    __m128i l[16], r[16];
+
+    in = l;
+    for (i = 0; i < 2; i++) {  // pass 1: rows 0-7 -> l, rows 8-15 -> r
+      in[0] = load_pack_8_32bit(input + 0 * 16);
+      in[1] = load_pack_8_32bit(input + 1 * 16);
+      in[2] = load_pack_8_32bit(input + 2 * 16);
+      in[3] = load_pack_8_32bit(input + 3 * 16);
+      in[4] = load_pack_8_32bit(input + 4 * 16);
+      in[5] = load_pack_8_32bit(input + 5 * 16);
+      in[6] = load_pack_8_32bit(input + 6 * 16);
+      in[7] = load_pack_8_32bit(input + 7 * 16);
+      transpose_16bit_8x8(in, in);
+
+      in[8] = load_pack_8_32bit(input + 0 * 16 + 8);
+      in[9] = load_pack_8_32bit(input + 1 * 16 + 8);
+      in[10] = load_pack_8_32bit(input + 2 * 16 + 8);
+      in[11] = load_pack_8_32bit(input + 3 * 16 + 8);
+      in[12] = load_pack_8_32bit(input + 4 * 16 + 8);
+      in[13] = load_pack_8_32bit(input + 5 * 16 + 8);
+      in[14] = load_pack_8_32bit(input + 6 * 16 + 8);
+      in[15] = load_pack_8_32bit(input + 7 * 16 + 8);
+      transpose_16bit_8x8(in + 8, in + 8);
+      idct16_8col(in);
+      in = r;  // the second iteration fills r
+      input += 128;  // advance by 8 rows of 16 coefficients
+    }
+
+    for (i = 0; i < 16; i += 8) {  // pass 2: 8 dest columns per turn
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      idct16_8col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {  // bd != 8: 32-bit helpers, 4 columns per highbd_idct16_4col call
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {  // pass 1: 4 input rows per iteration
+      in = all[i];
+      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
+      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
+      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
+      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
+      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
+      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
+      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
+      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
+      transpose_32bit_8x4(in, in);
+
+      in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8));
+      in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12));
+      in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8));
+      in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12));
+      in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8));
+      in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12));
+      in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8));
+      in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12));
+      transpose_32bit_8x4(in + 8, in + 8);
+
+      highbd_idct16_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {  // pass 2: 4 dest columns per turn
+      int j;
+      out[0] = all[0][i + 0];  // out[0..7] gather from all[0] and all[1]
+      out[1] = all[1][i + 0];
+      out[2] = all[0][i + 1];
+      out[3] = all[1][i + 1];
+      out[4] = all[0][i + 2];
+      out[5] = all[1][i + 2];
+      out[6] = all[0][i + 3];
+      out[7] = all[1][i + 3];
+      transpose_32bit_8x4(out, out);
+
+      out[8] = all[2][i + 0];  // out[8..15] gather from all[2] and all[3]
+      out[9] = all[3][i + 0];
+      out[10] = all[2][i + 1];
+      out[11] = all[3][i + 1];
+      out[12] = all[2][i + 2];
+      out[13] = all[3][i + 2];
+      out[14] = all[2][i + 3];
+      out[15] = all[3][i + 3];
+      transpose_32bit_8x4(out + 8, out + 8);
+
+      highbd_idct16_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {  // 16-bit helpers: transpose_16bit_8x8, idct16_8col
+    __m128i in[16];
+
+    in[0] = load_pack_8_32bit(input + 0 * 16);
+    in[1] = load_pack_8_32bit(input + 1 * 16);
+    in[2] = load_pack_8_32bit(input + 2 * 16);
+    in[3] = load_pack_8_32bit(input + 3 * 16);
+    in[4] = load_pack_8_32bit(input + 4 * 16);
+    in[5] = load_pack_8_32bit(input + 5 * 16);
+    in[6] = load_pack_8_32bit(input + 6 * 16);
+    in[7] = load_pack_8_32bit(input + 7 * 16);
+    transpose_16bit_8x8(in, in);
+
+    in[8] = _mm_setzero_si128();  // in[8..15] are zeroed, not loaded
+    in[9] = _mm_setzero_si128();
+    in[10] = _mm_setzero_si128();
+    in[11] = _mm_setzero_si128();
+    in[12] = _mm_setzero_si128();
+    in[13] = _mm_setzero_si128();
+    in[14] = _mm_setzero_si128();
+    in[15] = _mm_setzero_si128();
+    idct16_8col(in);
+
+    for (i = 0; i < 16; i += 8) {  // pass 2: 8 dest columns per turn
+      int j;
+      transpose_16bit_8x8(in + i, out);
+      out[8] = _mm_setzero_si128();  // out[8..15] zeroed as in pass 1
+      out[9] = _mm_setzero_si128();
+      out[10] = _mm_setzero_si128();
+      out[11] = _mm_setzero_si128();
+      out[12] = _mm_setzero_si128();
+      out[13] = _mm_setzero_si128();
+      out[14] = _mm_setzero_si128();
+      out[15] = _mm_setzero_si128();
+      idct16_8col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {  // bd != 8: 32-bit helpers, 4 columns per call
+    __m128i all[2][16], *in;
+
+    for (i = 0; i < 2; i++) {  // pass 1: rows 0-3, then rows 4-7
+      in = all[i];
+      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
+      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
+      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
+      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
+      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
+      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
+      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
+      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
+      transpose_32bit_8x4(in, in);
+      highbd_idct16x16_38_4col(in);
+      input += 4 * 16;
+    }
+
+    for (i = 0; i < 16; i += 4) {  // pass 2: 4 dest columns per turn
+      int j;
+      out[0] = all[0][i + 0];
+      out[1] = all[1][i + 0];
+      out[2] = all[0][i + 1];
+      out[3] = all[1][i + 1];
+      out[4] = all[0][i + 2];
+      out[5] = all[1][i + 2];
+      out[6] = all[0][i + 3];
+      out[7] = all[1][i + 3];
+      transpose_32bit_8x4(out, out);
+      highbd_idct16x16_38_4col(out);  // reads only out[0..7]
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
+
+void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int bd) {
+  int i;
+  __m128i out[16];
+
+  if (bd == 8) {
+    // bd 8: 16-bit helpers idct16x16_10_pass1 / idct16x16_10_pass2.
+    __m128i in[16], l[16];
+
+    in[0] = load_pack_8_32bit(input + 0 * 16);
+    in[1] = load_pack_8_32bit(input + 1 * 16);
+    in[2] = load_pack_8_32bit(input + 2 * 16);
+    in[3] = load_pack_8_32bit(input + 3 * 16);
+
+    idct16x16_10_pass1(in, l);
+
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      idct16x16_10_pass2(l + i, in);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, in[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
+    // bd != 8: 32-bit path. Only the first 16 bytes of rows 0-3 are loaded;
+    // the second pass below consumes nothing beyond this one buffer.
+    __m128i in[16];
+
+    in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+    in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+    in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+    in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+    transpose_32bit_4x4(in, in);
+    highbd_idct16x16_10_4col(in);
+
+    // Second pass: every 4 first-pass vectors yield 4 dest columns.
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(&in[i], out);
+      highbd_idct16x16_10_4col(out);
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
index d19887d00..17b87a913 100644
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -32,8 +32,8 @@ static INLINE __m128i multiplication_round_shift_sse4_1(
 static INLINE void highbd_multiplication_and_add_sse4_1(
     const __m128i in0, const __m128i in1, const int c0, const int c1,
     __m128i *const out0, __m128i *const out1) {
-  const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0);
-  const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
   __m128i temp1[4], temp2[4];
 
   extend_64bit(in0, temp1);
@@ -58,4 +58,15 @@ static INLINE void highbd_multiplication_and_add_sse4_1(
   *out1 = pack_4(temp2[0], temp2[1]);
 }
 
+static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
+                                                const int c1,
+                                                __m128i *const out0,
+                                                __m128i *const out1) {
+  __m128i temp[2];  // filled once by extend_64bit, then used for both outputs
+
+  extend_64bit(in, temp);
+  *out0 = multiplication_round_shift_sse4_1(temp, c0);  // in with c0
+  *out1 = multiplication_round_shift_sse4_1(temp, c1);  // in with c1
+}
+
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_