-rw-r--r-- | vpx_dsp/arm/highbd_idct16x16_add_neon.c |  16
-rw-r--r-- | vpx_dsp/arm/highbd_idct4x4_add_neon.c   |  20
-rw-r--r-- | vpx_dsp/arm/highbd_idct8x8_add_neon.c   | 114
-rw-r--r-- | vpx_dsp/arm/idct16x16_add_neon.c        |   4
-rw-r--r-- | vpx_dsp/arm/idct32x32_add_neon.c        |   6
-rw-r--r-- | vpx_dsp/arm/idct_neon.h                 |  93
6 files changed, 132 insertions, 121 deletions
diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
index 8ab2960c0..0f96165e1 100644
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -19,14 +19,14 @@ static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t,
                                                       int32x4x2_t *const d1) {
   int32x2x2_t t32[4];
 
-  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], 14);
-  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], 14);
-  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], 14);
-  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], 14);
-  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], 14);
-  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], 14);
-  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], 14);
-  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], 14);
+  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
   d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
   d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
   d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
index 26fa3e216..128f72b9c 100644
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -82,10 +82,10 @@ static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
   b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
   b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
   b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
-  b0 = vrshrq_n_s32(b0, 14);
-  b1 = vrshrq_n_s32(b1, 14);
-  b2 = vrshrq_n_s32(b2, 14);
-  b3 = vrshrq_n_s32(b3, 14);
+  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
   *a0 = vaddq_s32(b0, b3);
   *a1 = vaddq_s32(b1, b2);
   *a2 = vsubq_s32(b1, b2);
@@ -119,10 +119,14 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
   c5 = vsubq_s64(c5, c9);
   c6 = vaddq_s64(c6, c10);
   c7 = vaddq_s64(c7, c11);
-  b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
-  b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
-  b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
-  b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
+                    vrshrn_n_s64(c1, DCT_CONST_BITS));
+  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
+                    vrshrn_n_s64(c3, DCT_CONST_BITS));
+  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
+                    vrshrn_n_s64(c5, DCT_CONST_BITS));
+  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
+                    vrshrn_n_s64(c7, DCT_CONST_BITS));
   *a0 = vaddq_s32(b0, b3);
   *a1 = vaddq_s32(b1, b2);
   *a2 = vsubq_s32(b1, b2);
diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index 141d2e68d..f53f4c7fc 100644
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -82,18 +82,18 @@ static INLINE void idct8x8_12_half1d_bd10(
   step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
   step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
   step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
   step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
   step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -109,8 +109,8 @@ static INLINE void idct8x8_12_half1d_bd10(
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -154,14 +154,14 @@ static INLINE void idct8x8_12_half1d_bd12(
   t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
   t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
   t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -174,12 +174,12 @@ static INLINE void idct8x8_12_half1d_bd12(
   t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
   t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
   t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
   step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -205,10 +205,10 @@ static INLINE void idct8x8_12_half1d_bd12(
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] =
       vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
@@ -377,10 +377,10 @@ static INLINE void idct8x8_64_half1d_bd10(
   step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
   step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
 
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
@@ -392,10 +392,10 @@ static INLINE void idct8x8_64_half1d_bd10(
   step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
   step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
 
-  step2[0] = vrshrq_n_s32(step2[0], 14);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -411,8 +411,8 @@ static INLINE void idct8x8_64_half1d_bd10(
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -473,14 +473,14 @@ static INLINE void idct8x8_64_half1d_bd12(
   t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
   t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
   t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -501,14 +501,14 @@ static INLINE void idct8x8_64_half1d_bd12(
   t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
   t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
   t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[0] = vcombine_s32(t32[0], t32[1]);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
@@ -535,10 +535,10 @@ static INLINE void idct8x8_64_half1d_bd12(
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] =
       vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c
index 728ebaeef..b2f516f41 100644
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -16,8 +16,8 @@ static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                 int16x4_t *const d1) {
-  *d0 = vrshrn_n_s32(t32[0], 14);
-  *d1 = vrshrn_n_s32(t32[1], 14);
+  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 }
 
 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c
index de1bf9787..ae9457e18 100644
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -147,8 +147,10 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
   q11s32 = vaddq_s32(q12s32, q11s32);
   q10s32 = vaddq_s32(q10s32, q15s32);
 
-  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q9s32, DCT_CONST_BITS));
+  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q10s32, DCT_CONST_BITS));
 }
 
 static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index 9f27e6404..7f7f2f133 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -15,6 +15,7 @@
 
 #include "./vpx_config.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
@@ -93,21 +94,21 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
 
 //------------------------------------------------------------------------------
 
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                       const int16_t a_const) {
-  // Shift by 14 + rounding will be within 16 bits for well formed streams.
-  // See WRAPLOW and dct_const_round_shift for details.
+  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+  // streams. See WRAPLOW and dct_const_round_shift for details.
   // This instruction doubles the result and returns the high half, essentially
   // resulting in a right shift by 15. By multiplying the constant first that
-  // becomes a right shift by 14.
+  // becomes a right shift by DCT_CONST_BITS.
   // The largest possible value used here is
   // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just*
   // within the range of int16_t (+32767 / -32768) even when negated.
   return vqrdmulhq_n_s16(a, a_const * 2);
 }
 
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   // In both add_ and it's pair, sub_, the input for well-formed streams will be
@@ -121,21 +122,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
   int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
 static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
   int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
 static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
     const int16x8_t a, const int16_t a_const, const int16x8_t b,
     const int16_t b_const) {
@@ -143,7 +147,8 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
   int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
   temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
   temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Shift the output down by 6 and add it to the destination buffer.
@@ -233,10 +238,10 @@ static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
   c3 = vmull_lane_s16(b2, cospis, 1);
   c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
   c3 = vmlal_lane_s16(c3, b3, cospis, 3);
-  b0 = vrshrn_n_s32(c0, 14);
-  b1 = vrshrn_n_s32(c1, 14);
-  b2 = vrshrn_n_s32(c2, 14);
-  b3 = vrshrn_n_s32(c3, 14);
+  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
+  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
+  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
+  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
   d0 = vcombine_s16(b0, b1);
   d1 = vcombine_s16(b3, b2);
   *a0 = vaddq_s16(d0, d1);
@@ -278,8 +283,8 @@ static INLINE void idct8x8_12_pass1_bd8(
   t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
   t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
   t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
-  step1[5] = vrshrn_n_s32(t32[0], 14);
-  step1[6] = vrshrn_n_s32(t32[1], 14);
+  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vadd_s16(step1[0], step2[7]);
@@ -337,10 +342,10 @@ static INLINE void idct8x8_12_pass2_bd8(
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   step1[5] = vcombine_s16(t16[0], t16[1]);
   step1[6] = vcombine_s16(t16[2], t16[3]);
 
@@ -405,14 +410,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
   t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
   t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
   t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
   step1[4] = vcombine_s16(t16[0], t16[1]);
   step1[5] = vcombine_s16(t16[2], t16[3]);
   step1[6] = vcombine_s16(t16[4], t16[5]);
@@ -433,14 +438,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
   t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
   t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
   t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
   step2[0] = vcombine_s16(t16[0], t16[1]);
   step2[1] = vcombine_s16(t16[2], t16[3]);
   step2[2] = vcombine_s16(t16[4], t16[5]);
@@ -463,10 +468,10 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   step1[5] = vcombine_s16(t16[0], t16[1]);
   step1[6] = vcombine_s16(t16[2], t16[3]);
 
@@ -486,10 +491,10 @@ static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
                                               int16x8_t *const d1) {
   int16x4_t t16[4];
 
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   *d0 = vcombine_s16(t16[0], t16[1]);
   *d1 = vcombine_s16(t16[2], t16[3]);
 }
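Note on the constant being named: every literal 14 replaced above is the rounding right shift applied after multiplying by a cospi constant, which txfm_common.h calls DCT_CONST_BITS (hence the new #include in idct_neon.h). The scalar sketch below models that rounding and the "a_const * 2" trick described in the multiply_shift_and_narrow_s16() comment; round_shift() and vqrdmulh_model() are illustrative names local to this sketch, not libvpx functions, and the VQRDMULH formula is my paraphrase of the Arm instruction's documented behaviour.

/*
 * Standalone sketch (not part of libvpx): verify that passing a_const * 2 to
 * a VQRDMULH-style "double and take the high half" multiply reproduces a
 * rounding right shift by DCT_CONST_BITS, as the idct_neon.h comment claims.
 */
#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14 /* value the patch substitutes for the literal 14 */

/* Rounding right shift by DCT_CONST_BITS: what vrshrn_n_s32(x, 14) and
 * dct_const_round_shift() compute for an in-range product. */
static int32_t round_shift(int64_t x) {
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* One lane of VQRDMULH.S16: saturate((2 * a * b + (1 << 15)) >> 16). */
static int16_t vqrdmulh_model(int16_t a, int16_t b) {
  int64_t p = (2 * (int64_t)a * b + (1 << 15)) >> 16;
  if (p > INT16_MAX) p = INT16_MAX;
  if (p < INT16_MIN) p = INT16_MIN;
  return (int16_t)p;
}

int main(void) {
  /* cospi_1_64 = 16364 is the largest constant cited in idct_neon.h. */
  const int16_t c = 16364;
  int mismatches = 0;
  int16_t a;
  for (a = -1024; a <= 1024; ++a) {
    /* multiply_shift_and_narrow_s16() passes a_const * 2 so the instruction's
     * implicit shift by 15 becomes a shift by DCT_CONST_BITS. */
    const int32_t expected = round_shift((int64_t)a * c);
    const int16_t got = vqrdmulh_model(a, (int16_t)(2 * c));
    if (expected != got) ++mismatches;
  }
  printf("mismatches: %d\n", mismatches);
  return mismatches != 0;
}

Built with any C compiler, this should report zero mismatches over the tested range, since (4ac + 2^15) >> 16 equals (ac + 2^13) >> 14 whenever the saturating step does not trigger.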