Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/arm/highbd_idct4x4_add_neon.c  177
-rw-r--r--  vpx_dsp/arm/idct4x4_add_neon.c          39
-rw-r--r--  vpx_dsp/arm/idct_neon.h                 34
-rw-r--r--  vpx_dsp/arm/transpose_neon.h            38
-rw-r--r--  vpx_dsp/vpx_dsp.mk                       2
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl             3
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c             52
7 files changed, 282 insertions, 63 deletions
diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
new file mode 100644
index 000000000..96625b98b
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ const tran_low_t out0 = dct_const_round_shift(input[0] * cospi_16_64);
+ const tran_low_t out1 = dct_const_round_shift(out0 * cospi_16_64);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int16x8_t a;
+ uint16x8_t b;
+ uint16x4_t d0, d1;
+
+ for (i = 0; i < 2; i++) {
+ d0 = vld1_u16(dest);
+ d1 = vld1_u16(dest + dest_stride);
+ a = vreinterpretq_s16_u16(vcombine_u16(d0, d1));
+ a = vaddq_s16(dc, a);
+ a = vminq_s16(a, max);
+ b = vqshluq_n_s16(a, 0);
+ vst1_u16(dest, vget_low_u16(b));
+ dest += dest_stride;
+ vst1_u16(dest, vget_high_u16(b));
+ dest += dest_stride;
+ }
+}
+
+static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
+ int32x4_t *const a0,
+ int32x4_t *const a1,
+ int32x4_t *const a2,
+ int32x4_t *const a3) {
+ int32x4_t b0, b1, b2, b3;
+
+ transpose_s32_4x4(a0, a1, a2, a3);
+ b0 = vaddq_s32(*a0, *a2);
+ b1 = vsubq_s32(*a0, *a2);
+ b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
+ b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
+ b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);
+ b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
+ b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
+ b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
+ b0 = vrshrq_n_s32(b0, 14);
+ b1 = vrshrq_n_s32(b1, 14);
+ b2 = vrshrq_n_s32(b2, 14);
+ b3 = vrshrq_n_s32(b3, 14);
+ *a0 = vaddq_s32(b0, b3);
+ *a1 = vaddq_s32(b1, b2);
+ *a2 = vsubq_s32(b1, b2);
+ *a3 = vsubq_s32(b0, b3);
+}
+
+static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
+ int32x4_t *const a0,
+ int32x4_t *const a1,
+ int32x4_t *const a2,
+ int32x4_t *const a3) {
+ int32x4_t b0, b1, b2, b3;
+ int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;
+
+ transpose_s32_4x4(a0, a1, a2, a3);
+ b0 = vaddq_s32(*a0, *a2);
+ b1 = vsubq_s32(*a0, *a2);
+ c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
+ c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
+ c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
+ c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
+ c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
+ c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
+ c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
+ c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
+ c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
+ c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
+ c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
+ c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
+ c4 = vsubq_s64(c4, c8);
+ c5 = vsubq_s64(c5, c9);
+ c6 = vaddq_s64(c6, c10);
+ c7 = vaddq_s64(c7, c11);
+ b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
+ b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
+ b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
+ b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+ *a0 = vaddq_s32(b0, b3);
+ *a1 = vaddq_s32(b1, b2);
+ *a2 = vsubq_s32(b1, b2);
+ *a3 = vsubq_s32(b0, b3);
+}
+
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,
+ 6270 };
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ int32x4_t c0 = vld1q_s32(input);
+ int32x4_t c1 = vld1q_s32(input + 4);
+ int32x4_t c2 = vld1q_s32(input + 8);
+ int32x4_t c3 = vld1q_s32(input + 12);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const uint16_t *dst = dest;
+ int16x8_t a0, a1, d01, d32;
+ int16x4_t d0, d1, d2, d3;
+ uint16x8_t d01_u16, d32_u16;
+
+ if (bd == 8) {
+ const int16x4_t cospis = vld1_s16(kCospi);
+
+ // Rows
+ a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1));
+ a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3));
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+
+ // Columns
+ a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
+ a0 = vrshrq_n_s16(a0, 4);
+ a1 = vrshrq_n_s16(a1, 4);
+ } else {
+ const int32x4_t cospis = vld1q_s32(kCospi32);
+
+ if (bd == 10) {
+ idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
+ idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);
+ } else {
+ idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
+ idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);
+ }
+ // Note: In some profile tests, a0 and a1 are quite close to +/-32767.
+ // We use saturating narrow shift in case they could be even larger.
+ a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4));
+ a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4));
+ }
+
+ d0 = vreinterpret_s16_u16(vld1_u16(dst));
+ dst += dest_stride;
+ d1 = vreinterpret_s16_u16(vld1_u16(dst));
+ dst += dest_stride;
+ d2 = vreinterpret_s16_u16(vld1_u16(dst));
+ dst += dest_stride;
+ d3 = vreinterpret_s16_u16(vld1_u16(dst));
+ d01 = vcombine_s16(d0, d1);
+ d32 = vcombine_s16(d3, d2);
+
+  // Note: In some profile tests, a0 and a1 are quite close to +/-32767.
+ // We use saturating addition.
+ d01 = vqaddq_s16(a0, d01);
+ d32 = vqaddq_s16(a1, d32);
+ d01 = vminq_s16(d01, max);
+ d32 = vminq_s16(d32, max);
+ d01_u16 = vqshluq_n_s16(d01, 0);
+ d32_u16 = vqshluq_n_s16(d32, 0);
+
+ vst1_u16(dest, vget_low_u16(d01_u16));
+ dest += dest_stride;
+ vst1_u16(dest, vget_high_u16(d01_u16));
+ dest += dest_stride;
+ vst1_u16(dest, vget_high_u16(d32_u16));
+ dest += dest_stride;
+ vst1_u16(dest, vget_low_u16(d32_u16));
+}
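
For reference, the DC-only path in vpx_highbd_idct4x4_1_add_neon above scales the single DC coefficient twice by cospi_16_64 with 14-bit rounding, rounds the result by a further 4 bits, and adds it to every pixel of the 4x4 block, clamping to [0, (1 << bd) - 1]. A minimal scalar sketch of that behaviour follows; it is illustrative only, and the helpers round_shift_14 and clip_pixel_bd are hypothetical stand-ins for dct_const_round_shift and the bit-depth clamp.

#include <stdint.h>

/* Sketch of the DC-only 4x4 high-bitdepth inverse transform + add.
 * round_shift_14() mirrors dct_const_round_shift (DCT_CONST_BITS == 14) and
 * clip_pixel_bd() clamps to [0, (1 << bd) - 1]; both are illustrative helpers. */
static int32_t round_shift_14(int64_t x) { return (int32_t)((x + (1 << 13)) >> 14); }

static uint16_t clip_pixel_bd(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
}

static void highbd_idct4x4_dc_add_ref(int32_t dc, uint16_t *dest, int stride,
                                      int bd) {
  const int32_t out0 = round_shift_14((int64_t)dc * 11585);   /* cospi_16_64 */
  const int32_t out1 = round_shift_14((int64_t)out0 * 11585); /* cospi_16_64 */
  const int32_t a1 = (out1 + 8) >> 4; /* ROUND_POWER_OF_TWO(out1, 4) */
  int r, c;
  for (r = 0; r < 4; ++r, dest += stride) {
    for (c = 0; c < 4; ++c) dest[c] = clip_pixel_bd(dest[c] + a1, bd);
  }
}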
diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c
index 6ac516140..5ccc95ce0 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -13,45 +13,12 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
-static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0,
- int16x8_t *a1) {
- int16x4_t b0, b1, b2, b3;
- int32x4_t c0, c1, c2, c3;
- int16x8_t d0, d1;
-
- transpose_s16_4x4q(a0, a1);
- b0 = vget_low_s16(*a0);
- b1 = vget_high_s16(*a0);
- b2 = vget_low_s16(*a1);
- b3 = vget_high_s16(*a1);
- c0 = vmull_lane_s16(b0, cospis, 2);
- c2 = vmull_lane_s16(b1, cospis, 2);
- c1 = vsubq_s32(c0, c2);
- c0 = vaddq_s32(c0, c2);
- c2 = vmull_lane_s16(b2, cospis, 3);
- c3 = vmull_lane_s16(b2, cospis, 1);
- c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
- c3 = vmlal_lane_s16(c3, b3, cospis, 3);
- b0 = vrshrn_n_s32(c0, 14);
- b1 = vrshrn_n_s32(c1, 14);
- b2 = vrshrn_n_s32(c2, 14);
- b3 = vrshrn_n_s32(c3, 14);
- d0 = vcombine_s16(b0, b1);
- d1 = vcombine_s16(b3, b2);
- *a0 = vaddq_s16(d0, d1);
- *a1 = vsubq_s16(d0, d1);
-}
-
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
int dest_stride) {
- DECLARE_ALIGNED(16, static const int16_t, cospi[4]) = {
- 0, (int16_t)cospi_8_64, (int16_t)cospi_16_64, (int16_t)cospi_24_64
- };
const uint8_t *dst = dest;
- const int16x4_t cospis = vld1_s16(cospi);
+ const int16x4_t cospis = vld1_s16(kCospi);
uint32x2_t dest01_u32 = vdup_n_u32(0);
uint32x2_t dest32_u32 = vdup_n_u32(0);
int16x8_t a0, a1;
@@ -64,11 +31,11 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
// Rows
a0 = load_tran_low_to_s16q(input);
a1 = load_tran_low_to_s16q(input + 8);
- idct4x4_16_kernel(cospis, &a0, &a1);
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
// Columns
a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));
- idct4x4_16_kernel(cospis, &a0, &a1);
+ idct4x4_16_kernel_bd8(cospis, &a0, &a1);
a0 = vrshrq_n_s16(a0, 4);
a1 = vrshrq_n_s16(a1, 4);
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index e4493a105..51eba062b 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -17,6 +17,9 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
+DECLARE_ALIGNED(16, static const int16_t, kCospi[4]) = { 0, 15137, 11585,
+ 6270 };
+
//------------------------------------------------------------------------------
// Helper functions used to load tran_low_t into int16, narrowing if necessary.
@@ -180,4 +183,35 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
b += b_stride;
vst1_u8(b, b7);
}
+
+static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
+ int16x8_t *const a0,
+ int16x8_t *const a1) {
+ int16x4_t b0, b1, b2, b3;
+ int32x4_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+
+ transpose_s16_4x4q(a0, a1);
+ b0 = vget_low_s16(*a0);
+ b1 = vget_high_s16(*a0);
+ b2 = vget_low_s16(*a1);
+ b3 = vget_high_s16(*a1);
+ c0 = vmull_lane_s16(b0, cospis, 2);
+ c2 = vmull_lane_s16(b1, cospis, 2);
+ c1 = vsubq_s32(c0, c2);
+ c0 = vaddq_s32(c0, c2);
+ c2 = vmull_lane_s16(b2, cospis, 3);
+ c3 = vmull_lane_s16(b2, cospis, 1);
+ c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
+ c3 = vmlal_lane_s16(c3, b3, cospis, 3);
+ b0 = vrshrn_n_s32(c0, 14);
+ b1 = vrshrn_n_s32(c1, 14);
+ b2 = vrshrn_n_s32(c2, 14);
+ b3 = vrshrn_n_s32(c3, 14);
+ d0 = vcombine_s16(b0, b1);
+ d1 = vcombine_s16(b3, b2);
+ *a0 = vaddq_s16(d0, d1);
+ *a1 = vsubq_s16(d0, d1);
+}
+
#endif // VPX_DSP_ARM_IDCT_NEON_H_
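
The shared kernel idct4x4_16_kernel_bd8 added above performs one pass of the 4-point IDCT: an even butterfly on inputs 0/2 scaled by cospi_16_64 and an odd butterfly on inputs 1/3 scaled by cospi_8_64/cospi_24_64, each rounded by 14 bits, followed by the final add/sub stage. The kCospi table holds { 0, cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270 }, which is why the multiplies pick lanes 1-3. A scalar sketch of the same pass on one column (illustration only, not library code):

#include <stdint.h>

/* One 4-point IDCT pass in scalar form, mirroring idct4x4_16_kernel_bd8.
 * The constants and the 14-bit rounding match the NEON code; sketch only. */
static void idct4_pass_ref(const int16_t in[4], int16_t out[4]) {
  const int32_t cospi_8_64 = 15137, cospi_16_64 = 11585, cospi_24_64 = 6270;
  const int32_t s0 = (in[0] + in[2]) * cospi_16_64;            /* even butterfly */
  const int32_t s1 = (in[0] - in[2]) * cospi_16_64;
  const int32_t s2 = in[1] * cospi_24_64 - in[3] * cospi_8_64; /* odd butterfly */
  const int32_t s3 = in[1] * cospi_8_64 + in[3] * cospi_24_64;
  const int16_t b0 = (int16_t)((s0 + (1 << 13)) >> 14);
  const int16_t b1 = (int16_t)((s1 + (1 << 13)) >> 14);
  const int16_t b2 = (int16_t)((s2 + (1 << 13)) >> 14);
  const int16_t b3 = (int16_t)((s3 + (1 << 13)) >> 14);
  out[0] = b0 + b3;
  out[1] = b1 + b2;
  out[2] = b1 - b2;
  out[3] = b0 - b3;
}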
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 29144a7e8..d0634fd0a 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -30,6 +30,13 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
return b0;
}
+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+ return b0;
+}
+
static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {
uint8x16x2_t b0;
b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
@@ -172,6 +179,37 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
*a1 = d0.val[1];
}
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
const int16x4_t a2, const int16x4_t a3,
const int16x4_t a4, const int16x4_t a5,
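
The comment block inside transpose_s32_4x4 traces the permutation: vtrnq_s32 interleaves 32-bit lanes between adjacent rows, then vpx_vtrnq_s64_to_s32 swaps 64-bit halves, which together give the full 4x4 transpose. As a quick reference (an illustrative scalar sketch, not part of the library), the net effect is element (r, c) moving to (c, r):

#include <stdint.h>

/* Scalar reference for the in-register transpose above: out[c][r] = in[r][c].
 * transpose_s32_4x4() computes the same result on four int32x4_t rows in place. */
static void transpose_4x4_ref(const int32_t in[4][4], int32_t out[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) out[c][r] = in[r][c];
  }
}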
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 200ef07f1..f04736ae0 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -226,6 +226,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+else # CONFIG_VP9_HIGHBITDEPTH
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d78a35757..58969695b 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -618,6 +618,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_iwht4x4_16_add sse2/;
add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vpx_highbd_idct4x4_1_add neon/;
add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -709,7 +710,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct32x32_1_add neon sse2/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vpx_highbd_idct4x4_16_add sse2/;
+ specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct8x8_64_add sse2/;
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index d5fc1440c..487a474a6 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -402,10 +402,10 @@ void iadst4_sse2(__m128i *in) {
MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
\
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
} \
\
/* Stage3 */ \
@@ -413,10 +413,10 @@ void iadst4_sse2(__m128i *in) {
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
\
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
\
tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
@@ -438,14 +438,14 @@ void iadst4_sse2(__m128i *in) {
} \
\
/* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
+ out0 = _mm_add_epi16(stp1_0, stp2_7); \
+ out1 = _mm_add_epi16(stp1_1, stp1_6); \
+ out2 = _mm_add_epi16(stp1_2, stp1_5); \
+ out3 = _mm_add_epi16(stp1_3, stp2_4); \
+ out4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ out5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ out6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ out7 = _mm_sub_epi16(stp1_0, stp2_7); \
}
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -866,8 +866,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
stp2_0 = _mm_packs_epi32(tmp0, tmp2);
stp2_2 = _mm_packs_epi32(tmp6, tmp4);
- tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
- tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
+ tmp0 = _mm_add_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
stp2_4 = tmp0;
stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
@@ -878,8 +878,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
{
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
- tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
- tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
+ tmp4 = _mm_add_epi16(stp2_0, stp2_2);
+ tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
@@ -896,10 +896,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
}
// Stage4
- tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
- tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
- tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
- tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
+ tmp0 = _mm_add_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_add_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
@@ -3449,7 +3449,7 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
ubounded = _mm_cmpgt_epi16(value, max);
retval = _mm_andnot_si128(ubounded, value);
ubounded = _mm_and_si128(ubounded, max);
@@ -4012,7 +4012,7 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
__m128i dc_value, d;
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
int a, i, j;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out;