summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/arm/highbd_idct16x16_add_neon.c73
-rw-r--r--vpx_dsp/arm/highbd_idct32x32_add_neon.c89
-rw-r--r--vpx_dsp/arm/highbd_idct8x8_add_neon.c50
-rw-r--r--vpx_dsp/vpx_dsp.mk2
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl3
5 files changed, 201 insertions, 16 deletions
diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
new file mode 100644
index 000000000..bcf7de633
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t c0 = vminq_s16(b0, max);
+ const int16x8_t c1 = vminq_s16(b1, max);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+ vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+ *dest += stride;
+}
+
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+ const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+ vst1q_u16(*dest, c0);
+ vst1q_u16(*dest + 8, c1);
+ *dest += stride;
+}
+
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const tran_low_t out0 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ const tran_low_t out1 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int i;
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ for (i = 0; i < 4; ++i) {
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ }
+ } else {
+ for (i = 0; i < 4; ++i) {
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
diff --git a/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
new file mode 100644
index 000000000..d74331f80
--- /dev/null
+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const uint16x8_t a2 = vld1q_u16(*dest + 16);
+ const uint16x8_t a3 = vld1q_u16(*dest + 24);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+ const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+ const int16x8_t c0 = vminq_s16(b0, max);
+ const int16x8_t c1 = vminq_s16(b1, max);
+ const int16x8_t c2 = vminq_s16(b2, max);
+ const int16x8_t c3 = vminq_s16(b3, max);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+ vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+ vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+ vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+ *dest += stride;
+}
+
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const uint16x8_t a2 = vld1q_u16(*dest + 16);
+ const uint16x8_t a3 = vld1q_u16(*dest + 24);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+ const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+ const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+ const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+ const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+ const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+ vst1q_u16(*dest, c0);
+ vst1q_u16(*dest + 8, c1);
+ vst1q_u16(*dest + 16, c2);
+ vst1q_u16(*dest + 24, c3);
+ *dest += stride;
+}
+
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const tran_low_t out0 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ const tran_low_t out1 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int i;
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ }
+ } else {
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index c1c0f645d..141d2e68d 100644
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -15,21 +15,29 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"
-static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
- const int stride,
- const int16x8_t res,
- const int16x8_t max) {
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
const uint16x8_t a = vld1q_u16(*dest);
const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
const int16x8_t c = vminq_s16(b, max);
- const uint16x8_t d = vqshluq_n_s16(c, 0);
- vst1q_u16(*dest, d);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c));
+ *dest += stride;
+}
+
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a = vld1q_u16(*dest);
+ const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+ const uint16x8_t c = vqshluq_n_s16(b, 0);
+ vst1q_u16(*dest, c);
*dest += stride;
}
void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
- const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
const tran_low_t out1 =
@@ -38,14 +46,26 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
const int16x8_t dc = vdupq_n_s16(a1);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ } else {
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ }
}
static INLINE void idct8x8_12_half1d_bd10(
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a0f7d75aa..33c9e51e8 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -224,6 +224,8 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
else # CONFIG_VP9_HIGHBITDEPTH
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 07cdee31e..2e3c78215 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -624,13 +624,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_idct8x8_1_add neon/;
add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct16x16_1_add neon/;
add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct32x32_1_add sse2/;
+ specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";