author     Johann <johann.koenig@duck.com>  2018-11-12 11:30:03 -0800
committer  Johann <johann.koenig@duck.com>  2018-11-12 11:47:29 -0800
commit     43a30d3a1a6b627fa05ba63f4c51414ced781ccb (patch)
tree       e320e59da20c1a3fecef24485de73cb9e03c980b /vp8/encoder/arm
parent     4a8c248744500f9caf00588ca312efce5659e45e (diff)
quantize: use aarch64 vmaxv
Simplify the max value calculation on aarch64 by using vmaxv. This is
much faster for 4x4, but with diminishing returns as the block size grows.
Only the vp9 quantize has a speed test hooked up; similar results are
anticipated for the other quantize versions.
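
For context, the core of the change is the horizontal-max reduction: on 32-bit ARM
the largest lane of a uint16x8_t must be found by repeatedly folding the vector,
while AArch64 provides vmaxvq_u16 to do it in one instruction. The sketch below is
a simplified, hypothetical helper illustrating the idea in isolation; it is not the
code from the patch (the committed fallback widens to 32 bits so the result can be
stored with vst1_lane_s8).

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative only: reduce a uint16x8_t to its horizontal maximum. */
static uint16_t horizontal_max_u16(uint16x8_t v) {
#ifdef __aarch64__
  /* AArch64: a single instruction reduces all 8 lanes. */
  return vmaxvq_u16(v);
#else
  /* 32-bit ARM: fold the vector in halves until one lane remains. */
  uint16x4_t m = vmax_u16(vget_low_u16(v), vget_high_u16(v)); /* 8 -> 4 lanes */
  m = vpmax_u16(m, m);                                        /* 4 -> 2 lanes */
  m = vpmax_u16(m, m);                                        /* 2 -> 1 lane  */
  return vget_lane_u16(m, 0);
#endif
}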
Before:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.2 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.2 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1906 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )
After:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.1 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1803 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )
Change-Id: Ic95812b3fdbd4e47b4dcb8ed46c68a9617de38d2
Diffstat (limited to 'vp8/encoder/arm')
-rw-r--r--  vp8/encoder/arm/neon/fastquantizeb_neon.c  10
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index c42005df6..d066be1a7 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -26,9 +26,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
+#ifndef __aarch64__
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
+#endif  // __aarch64__
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +68,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
 
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
+#ifdef __aarch64__
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
   eob_q32 = vmovl_u16(eob_d16);
   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // __aarch64__
+
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
   vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +86,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* dqcoeff = x * dequant */
   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }