diff options
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/idct8x8_add_neon.c | 26 | ||||
-rw-r--r-- | vpx_dsp/arm/idct_neon.h | 39 |
2 files changed, 35 insertions, 30 deletions
diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 42e798ea4..7471387e4 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -17,28 +17,6 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, - const int stride) { - const uint8x8_t s = vld1_u8(*dest); - const int16x8_t res = vrshrq_n_s16(a, 5); - const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); - const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); - vst1_u8(*dest, d); - *dest += stride; -} - -static INLINE void add8x8(int16x8_t *const out, uint8_t *dest, - const int stride) { - idct8x8_add8x1(out[0], &dest, stride); - idct8x8_add8x1(out[1], &dest, stride); - idct8x8_add8x1(out[2], &dest, stride); - idct8x8_add8x1(out[3], &dest, stride); - idct8x8_add8x1(out[4], &dest, stride); - idct8x8_add8x1(out[5], &dest, stride); - idct8x8_add8x1(out[6], &dest, stride); - idct8x8_add8x1(out[7], &dest, stride); -} - void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16x8_t cospis = vld1q_s16(kCospi); @@ -57,7 +35,7 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, idct8x8_64_1d_bd8(cospis0, cospis1, a); idct8x8_64_1d_bd8(cospis0, cospis1, a); - add8x8(a, dest, stride); + idct8x8_add8x8_neon(a, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, @@ -77,5 +55,5 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); - add8x8(b, dest, stride); + idct8x8_add8x8_neon(b, dest, stride); } diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index 4e940bd21..c4d3b4711 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -417,18 +417,15 @@ static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, output[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, - const int16x4_t cospis1, - int16x8_t *const io) { +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, input7h; int16x4_t step1l[4], step1h[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], - &io[7]); - // stage 1 input1l = vget_low_s16(io[1]); input1h = vget_high_s16(io[1]); @@ -514,6 +511,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, io[7] = vsubq_s16(step1[0], step2[7]); } +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); +} + static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, const int16x8_t s1, const int16x4_t cospi_0_8_16_24, @@ -736,6 +741,28 @@ static INLINE void idct16x16_store_pass1(const int16x8_t *const out, vst1q_s16(output, out[15]); } +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} + static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, const int stride) { const uint8x8_t s = vld1_u8(*dest); |