summaryrefslogtreecommitdiff
path: root/vpx_dsp/arm/idct_neon.h
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp/arm/idct_neon.h')
-rw-r--r--vpx_dsp/arm/idct_neon.h34
1 files changed, 34 insertions, 0 deletions
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index e4493a105..51eba062b 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -17,6 +17,9 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
+DECLARE_ALIGNED(16, static const int16_t, kCospi[4]) = { 0, 15137, 11585,
+ 6270 };
+
//------------------------------------------------------------------------------
// Helper functions used to load tran_low_t into int16, narrowing if necessary.
@@ -180,4 +183,35 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1,
b += b_stride;
vst1_u8(b, b7);
}
+
+static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
+ int16x8_t *const a0,
+ int16x8_t *const a1) {
+ int16x4_t b0, b1, b2, b3;
+ int32x4_t c0, c1, c2, c3;
+ int16x8_t d0, d1;
+
+ transpose_s16_4x4q(a0, a1);
+ b0 = vget_low_s16(*a0);
+ b1 = vget_high_s16(*a0);
+ b2 = vget_low_s16(*a1);
+ b3 = vget_high_s16(*a1);
+ c0 = vmull_lane_s16(b0, cospis, 2);
+ c2 = vmull_lane_s16(b1, cospis, 2);
+ c1 = vsubq_s32(c0, c2);
+ c0 = vaddq_s32(c0, c2);
+ c2 = vmull_lane_s16(b2, cospis, 3);
+ c3 = vmull_lane_s16(b2, cospis, 1);
+ c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
+ c3 = vmlal_lane_s16(c3, b3, cospis, 3);
+ b0 = vrshrn_n_s32(c0, 14);
+ b1 = vrshrn_n_s32(c1, 14);
+ b2 = vrshrn_n_s32(c2, 14);
+ b3 = vrshrn_n_s32(c3, 14);
+ d0 = vcombine_s16(b0, b1);
+ d1 = vcombine_s16(b3, b2);
+ *a0 = vaddq_s16(d0, d1);
+ *a1 = vsubq_s16(d0, d1);
+}
+
#endif // VPX_DSP_ARM_IDCT_NEON_H_