summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Salome Thirot <salome.thirot@arm.com>, 2023-03-08 14:08:23 +0000
committer: Salome Thirot <salome.thirot@arm.com>, 2023-03-14 22:43:04 +0000
commit: 362c69cfe565e68f240eb37ae05695c50b435656 (patch)
tree: e345dd991c54921dc7d211dcba09cbc1338c715c
parent: bbd6bc85a39a4957e895c18e92c67d5fb6a32452 (diff)
downloadlibvpx-362c69cfe565e68f240eb37ae05695c50b435656.tar
libvpx-362c69cfe565e68f240eb37ae05695c50b435656.tar.gz
libvpx-362c69cfe565e68f240eb37ae05695c50b435656.tar.bz2
libvpx-362c69cfe565e68f240eb37ae05695c50b435656.zip
Optimize vpx_minmax_8x8_neon for aarch64
Optimize vpx_minmax_8x8_neon on AArch64 targets by using the UMAXV and UMINV instructions, which compute the maximum and minimum elements of a Neon vector.

Change-Id: I54c3a3a087d266f6774e6113e5947253df288a64
-rw-r--r-- vpx_dsp/arm/avg_neon.c 10
1 files changed, 8 insertions, 2 deletions
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index 56d97e22a..d48115dd0 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -210,11 +210,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
- // Split to D and start doing pairwise.
+#if defined(__aarch64__)
+ *min = *max = 0; // Clear high bits
+ *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+ *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
- // Enough runs of vpmax/min propogate the max/min values to every position.
+ // Enough runs of vpmax/min propagate the max/min values to every position.
ab_max = vpmax_u8(ab_max, ab_max);
ab_min = vpmin_u8(ab_min, ab_min);
@@ -228,4 +233,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
// Store directly to avoid costly neon->gpr transfer.
vst1_lane_u8((uint8_t *)max, ab_max, 0);
vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
}