diff options
Diffstat (limited to 'vpx_dsp/arm/idct_neon.h')
-rw-r--r-- | vpx_dsp/arm/idct_neon.h | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index e4493a105..51eba062b 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -17,6 +17,9 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/vpx_dsp_common.h" +DECLARE_ALIGNED(16, static const int16_t, kCospi[4]) = { 0, 15137, 11585, + 6270 }; + //------------------------------------------------------------------------------ // Helper functions used to load tran_low_t into int16, narrowing if necessary. @@ -180,4 +183,35 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, b += b_stride; vst1_u8(b, b7); } + +static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, + int16x8_t *const a0, + int16x8_t *const a1) { + int16x4_t b0, b1, b2, b3; + int32x4_t c0, c1, c2, c3; + int16x8_t d0, d1; + + transpose_s16_4x4q(a0, a1); + b0 = vget_low_s16(*a0); + b1 = vget_high_s16(*a0); + b2 = vget_low_s16(*a1); + b3 = vget_high_s16(*a1); + c0 = vmull_lane_s16(b0, cospis, 2); + c2 = vmull_lane_s16(b1, cospis, 2); + c1 = vsubq_s32(c0, c2); + c0 = vaddq_s32(c0, c2); + c2 = vmull_lane_s16(b2, cospis, 3); + c3 = vmull_lane_s16(b2, cospis, 1); + c2 = vmlsl_lane_s16(c2, b3, cospis, 1); + c3 = vmlal_lane_s16(c3, b3, cospis, 3); + b0 = vrshrn_n_s32(c0, 14); + b1 = vrshrn_n_s32(c1, 14); + b2 = vrshrn_n_s32(c2, 14); + b3 = vrshrn_n_s32(c3, 14); + d0 = vcombine_s16(b0, b1); + d1 = vcombine_s16(b3, b2); + *a0 = vaddq_s16(d0, d1); + *a1 = vsubq_s16(d0, d1); +} + #endif // VPX_DSP_ARM_IDCT_NEON_H_ |