5 files changed, 201 insertions, 16 deletions
diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
new file mode 100644
index 000000000..bcf7de633
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  vst1q_u16(*dest, c0);
+  vst1q_u16(*dest + 8, c1);
+  *dest += stride;
+}
+
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  const tran_low_t out0 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  const tran_low_t out1 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int i;
+
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 4; ++i) {
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 4; ++i) {
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
diff --git a/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
new file mode 100644
index 000000000..d74331f80
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -0,0 +1,89 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res,
+                                                     const int16x8_t max) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const int16x8_t c0 = vminq_s16(b0, max);
+  const int16x8_t c1 = vminq_s16(b1, max);
+  const int16x8_t c2 = vminq_s16(b2, max);
+  const int16x8_t c3 = vminq_s16(b3, max);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+  vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+  vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+                                                     const int stride,
+                                                     const int16x8_t res) {
+  const uint16x8_t a0 = vld1q_u16(*dest);
+  const uint16x8_t a1 = vld1q_u16(*dest + 8);
+  const uint16x8_t a2 = vld1q_u16(*dest + 16);
+  const uint16x8_t a3 = vld1q_u16(*dest + 24);
+  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+  vst1q_u16(*dest, c0);
+  vst1q_u16(*dest + 8, c1);
+  vst1q_u16(*dest + 16, c2);
+  vst1q_u16(*dest + 24, c3);
+  *dest += stride;
+}
+
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int bd) {
+  const tran_low_t out0 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+  const tran_low_t out1 =
+      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+  const int16x8_t dc = vdupq_n_s16(a1);
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  int i;
+
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+    }
+  } else {
+    for (i = 0; i < 8; ++i) {
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+    }
+  }
+}
diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index c1c0f645d..141d2e68d 100644
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -15,21 +15,29 @@
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
-static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
-                                               const int stride,
-                                               const int16x8_t res,
-                                               const int16x8_t max) {
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res,
+                                                   const int16x8_t max) {
   const uint16x8_t a = vld1q_u16(*dest);
   const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
   const int16x8_t c = vminq_s16(b, max);
-  const uint16x8_t d = vqshluq_n_s16(c, 0);
-  vst1q_u16(*dest, d);
+  vst1q_u16(*dest, vreinterpretq_u16_s16(c));
+  *dest += stride;
+}
+
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+                                                   const int stride,
+                                                   const int16x8_t res) {
+  const uint16x8_t a = vld1q_u16(*dest);
+  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+  const uint16x8_t c = vqshluq_n_s16(b, 0);
+  vst1q_u16(*dest, c);
   *dest += stride;
 }
 
 void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
-  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
   const tran_low_t out0 =
       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
   const tran_low_t out1 =
@@ -38,14 +46,26 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
   const int16x8_t dc = vdupq_n_s16(a1);
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
-  highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+  if (a1 >= 0) {
+    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+  } else {
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+  }
 }
 
 static INLINE void idct8x8_12_half1d_bd10(
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a0f7d75aa..33c9e51e8 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -224,6 +224,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
 else  # CONFIG_VP9_HIGHBITDEPTH
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct32x32_add_neon.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 07cdee31e..2e3c78215 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -624,13 +624,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_idct8x8_1_add neon/;
 
   add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+  specialize qw/vpx_highbd_idct16x16_1_add neon/;
 
   add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
 
   add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
 
   add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
-  specialize qw/vpx_highbd_idct32x32_1_add sse2/;
+  specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
 
   add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";