diff options
author | Johann <johann.koenig@duck.com> | 2018-11-12 11:30:03 -0800 |
---|---|---|
committer | Johann <johann.koenig@duck.com> | 2018-11-12 11:47:29 -0800 |
commit | 43a30d3a1a6b627fa05ba63f4c51414ced781ccb (patch) | |
tree | e320e59da20c1a3fecef24485de73cb9e03c980b /vp9/encoder/arm/neon | |
parent | 4a8c248744500f9caf00588ca312efce5659e45e (diff) | |
download | libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar.gz libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar.bz2 libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.zip |
quantize: use aarch64 vmaxv
Simplify max value calculation on aarch64 by using vmaxv. Much
faster for 4x4 but diminishing returns as the block size grows.
Only the vp9 quantize has a speed test hooked up. Anticipate
similar results for the other quantize versions.
Before:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.2 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.2 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1906 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )
After:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.1 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1803 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )
Change-Id: Ic95812b3fdbd4e47b4dcb8ed46c68a9617de38d2
Diffstat (limited to 'vp9/encoder/arm/neon')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_quantize_neon.c | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 2cec8bd03..8b62b450c 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_s16(v_eobmax_76543210); +#else { const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); @@ -111,6 +114,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); } +#endif // __aarch64__ } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -226,6 +230,9 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, dqcoeff_ptr += 8; } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else { const uint16x4_t eob_max_0 = vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); @@ -233,5 +240,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } +#endif // __aarch64__ } } |