author     Johann <johann.koenig@duck.com>  2018-11-12 11:30:03 -0800
committer  Johann <johann.koenig@duck.com>  2018-11-12 11:47:29 -0800
commit     43a30d3a1a6b627fa05ba63f4c51414ced781ccb (patch)
tree       e320e59da20c1a3fecef24485de73cb9e03c980b /vp8/encoder/arm
parent     4a8c248744500f9caf00588ca312efce5659e45e (diff)
quantize: use aarch64 vmaxv
Simplify the max value calculation on aarch64 by using vmaxv. This is
much faster for 4x4, but with diminishing returns as the block size grows.
Only the vp9 quantize has a speed test hooked up; similar results are
anticipated for the other quantize versions.
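
For context, the core of the change is the horizontal-max reduction: on 32-bit ARM
the largest lane of a uint16x8_t must be found by repeatedly folding the vector,
while AArch64 provides vmaxvq_u16 to do it in one instruction. The sketch below is
a simplified, hypothetical helper illustrating the idea in isolation; it is not the
code from the patch (the committed fallback widens to 32 bits so the result can be
stored with vst1_lane_s8).

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative only: reduce a uint16x8_t to its horizontal maximum. */
static uint16_t horizontal_max_u16(uint16x8_t v) {
#ifdef __aarch64__
  /* AArch64: a single instruction reduces all 8 lanes. */
  return vmaxvq_u16(v);
#else
  /* 32-bit ARM: fold the vector in halves until one lane remains. */
  uint16x4_t m = vmax_u16(vget_low_u16(v), vget_high_u16(v)); /* 8 -> 4 lanes */
  m = vpmax_u16(m, m);                                        /* 4 -> 2 lanes */
  m = vpmax_u16(m, m);                                        /* 2 -> 1 lane  */
  return vget_lane_u16(m, 0);
#endif
}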
Before:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.2 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.2 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1906 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )
After:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.1 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1803 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )
Change-Id: Ic95812b3fdbd4e47b4dcb8ed46c68a9617de38d2
Diffstat (limited to 'vp8/encoder/arm')
-rw-r--r--  vp8/encoder/arm/neon/fastquantizeb_neon.c  10
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index c42005df6..d066be1a7 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -26,9 +26,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
+#ifndef __aarch64__
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
+#endif  // __aarch64__
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +68,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
 
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
+#ifdef __aarch64__
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
   eob_q32 = vmovl_u16(eob_d16);
   eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // __aarch64__
+
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
   vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +86,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* dqcoeff = x * dequant */
   vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
   vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }