diff options
author | Salome Thirot <salome.thirot@arm.com> | 2023-03-08 14:08:23 +0000 |
---|---|---|
committer | Salome Thirot <salome.thirot@arm.com> | 2023-03-14 22:43:04 +0000 |
commit | 362c69cfe565e68f240eb37ae05695c50b435656 (patch) | |
tree | e345dd991c54921dc7d211dcba09cbc1338c715c | |
parent | bbd6bc85a39a4957e895c18e92c67d5fb6a32452 (diff) | |
download | libvpx-362c69cfe565e68f240eb37ae05695c50b435656.tar libvpx-362c69cfe565e68f240eb37ae05695c50b435656.tar.gz libvpx-362c69cfe565e68f240eb37ae05695c50b435656.tar.bz2 libvpx-362c69cfe565e68f240eb37ae05695c50b435656.zip |
Optimize vpx_minmax_8x8_neon for aarch64
Optimize vpx_minmax_8x8_neon on AArch64 targets by using the UMAXV and
UMINV instructions, which compute the maximum and minimum elements
across a Neon vector.
Change-Id: I54c3a3a087d266f6774e6113e5947253df288a64
-rw-r--r-- | vpx_dsp/arm/avg_neon.c | 10 |
1 file changed, 8 insertions, 2 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 56d97e22a..d48115dd0 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -210,11 +210,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - // Split to D and start doing pairwise. +#if defined(__aarch64__) + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - // Enough runs of vpmax/min propogate the max/min values to every position. + // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); @@ -228,4 +233,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, // Store directly to avoid costly neon->gpr transfer. vst1_lane_u8((uint8_t *)max, ab_max, 0); vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif } |