From f310ddc4704e8b1cd5ec72472495ee8c3b13a486 Mon Sep 17 00:00:00 2001
From: Johann
Date: Thu, 22 Jun 2017 18:22:27 -0700
Subject: partial fdct neon: add 16x16_1

For the 8x8_1, the highbd output fit nicely in the existing function. 12
bit input will overflow this implementation of 16x16_1.

BUG=webm:1424

Change-Id: I2945fe5478b18f996f1a5de80110fa30f3f4e7ec
---
 vpx_dsp/arm/fdct_partial_neon.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'vpx_dsp/arm')

diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c
index 945b96a21..4e1a6dfda 100644
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -59,3 +59,21 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
   output[0] = sum_int16x8(sum);
   output[1] = 0;
 }
+// Partial 16x16 fdct: accumulates 16 rows of 16 samples into output[0].
+void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+                          int stride) {  // stride: int16_t elements per row
+  int r;
+  int16x8_t left = vld1q_s16(input);  // row 0, columns 0-7
+  int16x8_t right = vld1q_s16(input + 8);  // row 0, columns 8-15
+  input += stride;
+  for (r = 1; r < 16; ++r) {  // rows 1..15 are added onto row 0
+    const int16x8_t a = vld1q_s16(input);
+    const int16x8_t b = vld1q_s16(input + 8);
+    input += stride;
+    left = vaddq_s16(left, a);  // 16-bit lanes: 12-bit input can overflow
+    right = vaddq_s16(right, b);
+  }
+  // sum_int16x8() is defined outside this hunk; presumably adds the 8 lanes.
+  output[0] = (sum_int16x8(left) + sum_int16x8(right)) >> 1;  // halved total
+  output[1] = 0;
+}
-- 
cgit v1.2.3