diff options
-rw-r--r-- | vpx_dsp/arm/quantize_neon.c | 269 |
1 files changed, 130 insertions, 139 deletions
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index bd7818a07..dcdf588cb 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -17,20 +17,57 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH const int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); const int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); -#if CONFIG_VP9_HIGHBITDEPTH - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -41,106 +78,61 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(zbin_ptr); + int16x8_t round = vld1q_s16(round_ptr); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vld1q_s16(zbin_ptr); - const int16x8_t round = vld1q_s16(round_ptr); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } n_coeffs -= 8; { - const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); - const int16x8_t round = vdupq_n_s16(round_ptr[1]); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { // Add one because the eob is not its index. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 - qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; - n_coeffs -= 8; } while (n_coeffs > 0); } @@ -156,6 +148,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)scan; } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -164,7 +159,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) { static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, const int16x8_t dequant, - tran_low_t *dqcoeff) { + tran_low_t *dqcoeff_ptr) { int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); int32x4_t dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); @@ -176,14 +171,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, #if CONFIG_VP9_HIGHBITDEPTH dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); - vst1q_s32(dqcoeff, dqcoeff_0); - vst1q_s32(dqcoeff + 4, dqcoeff_1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); #else - vst1q_s16(dqcoeff, + vst1q_s16(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); #endif // CONFIG_VP9_HIGHBITDEPTH } +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -198,103 +230,58 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan; - (void)n_coeffs; // Because we will always calculate 32*32. + + // Only the first element of each vector is DC. + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. { - // Only the first element of each vector is DC. - const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - const int16x8_t quant = vld1q_s16(quant_ptr); - const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); - const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } { - const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); - const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); - const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); - const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); - const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { // Add one because the eob is not its index. const uint16x8_t v_iscan = vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); - const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); - const int16x8_t coeff_abs = vabsq_s16(coeff); - - const int16x8_t zbin_mask = - vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); - - const int16x8_t rounded = vqaddq_s16(coeff_abs, round); - - // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 - int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - - qcoeff = vaddq_s16(qcoeff, rounded); - - // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 - qcoeff = vqdmulhq_s16(qcoeff, quant_shift); - - // Restore the sign bit. - qcoeff = veorq_s16(qcoeff, coeff_sign); - qcoeff = vsubq_s16(qcoeff, coeff_sign); - - qcoeff = vandq_s16(qcoeff, zbin_mask); + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); // Set non-zero elements to -1 and use that to extract values for eob. eob_max = vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + __builtin_prefetch(coeff_ptr + 64); coeff_ptr += 8; iscan += 8; - - store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - - calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } } @@ -310,4 +297,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vst1_lane_u16(eob_ptr, eob_max_2, 0); } #endif // __aarch64__ + // Need these here, else the compiler complains about mixing declarations and + // code in C90 + (void)n_coeffs; + (void)scan; } |