From 9fe510c12ab51c2fb5d4d916f5729e19b8f5ccf4 Mon Sep 17 00:00:00 2001
From: Johann
Date: Thu, 22 Jun 2017 18:36:08 -0700
Subject: partial fdct neon: add 32x32_1

Always return an int32_t. Since it needs to be moved to a register
for shifting, this doesn't really penalize the smaller transforms.

The values could potentially be summed and shifted in place.

BUG=webm:1424

Change-Id: Id5beb35d79c7574ebd99285fc4182788cf2bb972
---
 vpx_dsp/arm/fdct_partial_neon.c | 39 +++++++++++++++++++++++++++++++--------
 vpx_dsp/vpx_dsp_rtcd_defs.pl    |  4 ++--
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c
index 4e1a6dfda..3c4b292d7 100644
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -14,16 +14,12 @@
 #include "./vpx_config.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
-static INLINE tran_low_t sum_int16x8(const int16x8_t a) {
+static INLINE int32_t sum_int16x8(const int16x8_t a) {
   const int32x4_t b = vpaddlq_s16(a);
   const int64x2_t c = vpaddlq_s32(b);
   const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
                                vreinterpret_s32_s64(vget_high_s64(c)));
-#if CONFIG_VP9_HIGHBITDEPTH
   return vget_lane_s32(d, 0);
-#else
-  return vget_lane_s16(vreinterpret_s16_s32(d), 0);
-#endif
 }
 
 void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
@@ -44,7 +40,7 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
 
   c = vaddq_s16(b0, b1);
 
-  output[0] = sum_int16x8(c) << 1;
+  output[0] = (tran_low_t)(sum_int16x8(c) << 1);
   output[1] = 0;
 }
 
@@ -56,7 +52,7 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
     sum = vaddq_s16(sum, input_00);
   }
 
-  output[0] = sum_int16x8(sum);
+  output[0] = (tran_low_t)sum_int16x8(sum);
   output[1] = 0;
 }
 
@@ -74,6 +70,33 @@ void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
     right = vaddq_s16(right, b);
   }
 
-  output[0] = (sum_int16x8(left) + sum_int16x8(right)) >> 1;
+  output[0] = (tran_low_t)((sum_int16x8(left) + sum_int16x8(right)) >> 1);
+  output[1] = 0;
+}
+
+void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  int r;
+  int16x8_t a0 = vld1q_s16(input);
+  int16x8_t a1 = vld1q_s16(input + 8);
+  int16x8_t a2 = vld1q_s16(input + 16);
+  int16x8_t a3 = vld1q_s16(input + 24);
+  input += stride;
+  for (r = 1; r < 32; ++r) {
+    const int16x8_t b0 = vld1q_s16(input);
+    const int16x8_t b1 = vld1q_s16(input + 8);
+    const int16x8_t b2 = vld1q_s16(input + 16);
+    const int16x8_t b3 = vld1q_s16(input + 24);
+    input += stride;
+    a0 = vaddq_s16(a0, b0);
+    a1 = vaddq_s16(a1, b1);
+    a2 = vaddq_s16(a2, b2);
+    a3 = vaddq_s16(a3, b3);
+  }
+
+  // TODO(johannkoenig): sum and shift the values in neon registers.
+  output[0] = (tran_low_t)(
+      (sum_int16x8(a0) + sum_int16x8(a1) + sum_int16x8(a2) + sum_int16x8(a3)) >>
+      3);
   output[1] = 0;
 }
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 734f6e1e2..bb83021d7 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -508,7 +508,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct32x32_rd sse2/;
 
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_1 sse2/;
+  specialize qw/vpx_fdct32x32_1 sse2 neon/;
 
   add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_highbd_fdct4x4 sse2/;
@@ -558,7 +558,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
 
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_1 sse2 msa/;
+  specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
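
Note on the TODO in vpx_fdct32x32_1_neon: below is a minimal sketch of how
the final reduction could be "summed and shifted in place", as the commit
message suggests. It assumes the four int16x8_t accumulators a0..a3 from the
row loop above; the helper name sum_and_shift_in_place is hypothetical and
not part of this change.

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper (not in this patch): reduce the four int16x8_t
 * accumulators from the 32x32 row loop and apply the >> 3 while the
 * data is still in NEON registers. */
static int32_t sum_and_shift_in_place(const int16x8_t a0, const int16x8_t a1,
                                      const int16x8_t a2, const int16x8_t a3) {
  /* Pairwise-add with widening so the 16-bit lane sums cannot overflow. */
  int32x4_t s = vpaddlq_s16(a0);
  s = vpadalq_s16(s, a1); /* widen a1 pairwise and accumulate into s */
  s = vpadalq_s16(s, a2);
  s = vpadalq_s16(s, a3);
  {
    /* Reduce the four 32-bit lanes to a single value. */
    const int64x2_t t = vpaddlq_s32(s);
    const int32x2_t u = vadd_s32(vreinterpret_s32_s64(vget_low_s64(t)),
                                 vreinterpret_s32_s64(vget_high_s64(t)));
    /* Shift in the vector register, then move out a single lane. */
    return vget_lane_s32(vshr_n_s32(u, 3), 0);
  }
}

The tail of vpx_fdct32x32_1_neon would then read
output[0] = (tran_low_t)sum_and_shift_in_place(a0, a1, a2, a3);
so only the final lane extraction crosses to the general-purpose side, and
the vpadalq accumulation also replaces three of the four per-accumulator
reductions that sum_int16x8 currently performs.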