diff options
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/avg_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/fwd_txfm_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/hadamard_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct16x16_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct32x32_135_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct32x32_34_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct32x32_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct4x4_1_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct4x4_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct8x8_add_neon.c | 1 | ||||
-rw-r--r-- | vpx_dsp/arm/idct_neon.h | 52 | ||||
-rw-r--r-- | vpx_dsp/arm/mem_neon.h | 71 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 1 |
13 files changed, 82 insertions, 52 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index cca9a9324..257e8ffee 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -16,6 +16,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { const uint32x4_t a = vpaddlq_u16(v_16x8); diff --git a/vpx_dsp/arm/fwd_txfm_neon.c b/vpx_dsp/arm/fwd_txfm_neon.c index b26920504..c449b4660 100644 --- a/vpx_dsp/arm/fwd_txfm_neon.c +++ b/vpx_dsp/arm/fwd_txfm_neon.c @@ -14,6 +14,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int stride) { diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index ebeafed31..79bedd848 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c index 828fb5f6c..5c5963d27 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.c +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c index b39825991..021211bc9 100644 --- a/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c index fc0c4cd84..f3c336fa3 100644 --- a/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c index 91418c9e6..9f4589ea9 100644 --- a/vpx_dsp/arm/idct32x32_add_neon.c +++ b/vpx_dsp/arm/idct32x32_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c index d1eae24a2..21d21b033 100644 --- a/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -12,6 +12,7 @@ #include <assert.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/inv_txfm.h" static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index bff98cbc1..e44ba6e75 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 279da67d7..1121ade27 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index 27c784edc..0fc1de8e4 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -41,58 +41,6 @@ DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = { }; //------------------------------------------------------------------------------ -// Helper functions used to load tran_low_t into int16, narrowing if necessary. - -static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4x2_t v0 = vld2q_s32(buf); - const int32x4x2_t v1 = vld2q_s32(buf + 8); - const int16x4_t s0 = vmovn_s32(v0.val[0]); - const int16x4_t s1 = vmovn_s32(v0.val[1]); - const int16x4_t s2 = vmovn_s32(v1.val[0]); - const int16x4_t s3 = vmovn_s32(v1.val[1]); - int16x8x2_t res; - res.val[0] = vcombine_s16(s0, s2); - res.val[1] = vcombine_s16(s1, s3); - return res; -#else - return vld2q_s16(buf); -#endif -} - -static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vld1q_s32(buf); - const int32x4_t v1 = vld1q_s32(buf + 4); - const int16x4_t s0 = vmovn_s32(v0); - const int16x4_t s1 = vmovn_s32(v1); - return vcombine_s16(s0, s1); -#else - return vld1q_s16(buf); -#endif -} - -static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vld1q_s32(buf); - return vmovn_s32(v0); -#else - return vld1_s16(buf); -#endif -} - -static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); - const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); - vst1q_s32(buf, v0); - vst1q_s32(buf + 4, v1); -#else - vst1q_s16(buf, a); -#endif -} - -//------------------------------------------------------------------------------ // Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h new file mode 100644 index 000000000..ef6e9decd --- /dev/null +++ b/vpx_dsp/arm/mem_neon.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_DSP_ARM_MEM_NEON_H_ + +#include <arm_neon.h> +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Helper functions used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} +#endif // VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 8d1ecbe8c..afa14accc 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -352,6 +352,7 @@ endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h # PPC VSX utilities |