quantize: use aarch64 vmaxv

Simplify the max value calculation on aarch64 by using vmaxv.

Much faster for 4x4, but with diminishing returns as the block size grows.

Only the vp9 quantize has a speed test hooked up. Similar results are anticipated for the other quantize versions.

Before:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 31.6 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 17.7 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.2 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.2 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1906 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )

After:
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/2
[ BENCH ] Bypass calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 4x4 29.1 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Full calculations 8x8 16.9 ms ( ±0.0 ms )
[ BENCH ] Bypass calculations 16x16 14.1 ms ( ±0.0 ms )
[ BENCH ] Full calculations 16x16 14.1 ms ( ±0.0 ms )
[ OK ] NEON/VP9QuantizeTest.DISABLED_Speed/2 (1803 ms)
[ RUN ] NEON/VP9QuantizeTest.DISABLED_Speed/3
[ BENCH ] Bypass calculations 32x32 18.6 ms ( ±0.0 ms )
[ BENCH ] Full calculations 32x32 18.6 ms ( ±0.0 ms )

Change-Id: Ic95812b3fdbd4e47b4dcb8ed46c68a9617de38d2
author: Johann <johann.koenig@duck.com> 2018-11-12 11:30:03 -0800
committer: Johann <johann.koenig@duck.com> 2018-11-12 11:47:29 -0800
commit: 43a30d3a1a6b627fa05ba63f4c51414ced781ccb (patch)
tree: e320e59da20c1a3fecef24485de73cb9e03c980b /vp9/encoder/arm/neon
parent: 4a8c248744500f9caf00588ca312efce5659e45e (diff)
download: libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar
libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar.gz
libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar.bz2
libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.zip
1 file changed, 8 insertions, 0 deletions
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 2cec8bd03..8b62b450c 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
     store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
   }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
   {
     const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
                                              vget_high_s16(v_eobmax_76543210));
@@ -111,6 +114,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
 
     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
   }
+#endif  // __aarch64__
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -226,6 +230,9 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
       dqcoeff_ptr += 8;
     }
 
+#ifdef __aarch64__
+    *eob_ptr = vmaxvq_u16(eob_max);
+#else
     {
       const uint16x4_t eob_max_0 =
           vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -233,5 +240,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
       const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
       vst1_lane_u16(eob_ptr, eob_max_2, 0);
     }
+#endif  // __aarch64__
   }
 }
author	Johann <johann.koenig@duck.com>	2018-11-12 11:30:03 -0800
committer	Johann <johann.koenig@duck.com>	2018-11-12 11:47:29 -0800
commit	43a30d3a1a6b627fa05ba63f4c51414ced781ccb (patch)
tree	e320e59da20c1a3fecef24485de73cb9e03c980b /vp9/encoder/arm/neon
parent	4a8c248744500f9caf00588ca312efce5659e45e (diff)
download	libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar.gz libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.tar.bz2 libvpx-43a30d3a1a6b627fa05ba63f4c51414ced781ccb.zip