author     Wilco Dijkstra <wilco.dijkstra@arm.com>    2023-01-11 13:52:08 +0000
committer  Wilco Dijkstra <wilco.dijkstra@arm.com>    2023-01-17 15:09:18 +0000
commit     51541a229740801882490177fa178e49264b13fb
tree       98b0dc945b5ff82427af70fc58773449941afe84
parent     1bbb1a2022e126f21810d3d0ebe0a975d5243e43
AArch64: Optimize strchr
Simplify calculation of the mask using shrn. Unroll the main loop.
Small strings are 20% faster on modern CPUs.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
 sysdeps/aarch64/strchr.S | 52 ++++++++++++++++++++++++----------------------------
 1 file changed, 24 insertions(+), 28 deletions(-)
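For readers less fluent in NEON assembly, here is a minimal C model of the shrn
syndrome trick the patch adopts. It is a sketch, not the glibc code: strchr_model
is a hypothetical name, it assumes a 16-byte-aligned input whose page stays
readable to the next 16-byte boundary (the real code aligns src and shifts the
first syndrome instead), and it models only the little-endian path using ACLE
intrinsics from arm_neon.h.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only, not the glibc implementation.  */
static char *
strchr_model (const char *s, int c)
{
  const uint8_t *p = (const uint8_t *) s;
  uint8x16_t repchr = vdupq_n_u8 ((uint8_t) c);   /* dup vrepchr.16b, chrin  */
  uint8x16_t repmask = vdupq_n_u8 (0x33);         /* movi vrepmask.16b, 0x33 */

  for (;;)
    {
      uint8x16_t data = vld1q_u8 (p);
      /* 0xff in every byte that matches c or is NUL, respectively.  */
      uint8x16_t has_chr = vceqq_u8 (data, repchr);
      uint8x16_t has_nul = vceqq_u8 (data, vdupq_n_u8 (0));
      /* bit: under mask 0x33, bits 0-1 (and 4-5) of each byte come from
         has_chr, bits 2-3 (and 6-7) from has_nul.  */
      uint8x16_t merged = vbslq_u8 (repmask, has_chr, has_nul);
      /* shrn #4: shift each 16-bit lane right by 4 and keep the low byte,
         compressing 128 bits into a 64-bit syndrome, one nibble per byte.  */
      uint8x8_t narrow = vshrn_n_u16 (vreinterpretq_u16_u8 (merged), 4);
      uint64_t syn = vget_lane_u64 (vreinterpret_u64_u8 (narrow), 0);
      if (syn != 0)
        {
          /* Bits 0-1 of a nibble flag a character match, bits 2-3 a NUL,
             so the trailing-zero count is a multiple of 4 iff the
             character was found before the end of the string.  */
          unsigned int tz = __builtin_ctzll (syn);
          return (tz & 2) ? NULL : (char *) (p + (tz >> 2));
        }
      p += 16;
    }
}

The old code needed two mov/dup pairs to build the 0x3003/0xf00f masks, plus an
and and an addp to compress the comparison result; a movi and a single shrn now
do the same work, which is the simplification the commit message refers to.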
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 900ef15944..14ae1513a7 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -32,8 +32,7 @@
#define src x2
#define tmp1 x1
-#define wtmp2 w3
-#define tmp3 x3
+#define tmp2 x3
#define vrepchr v0
#define vdata v1
@@ -41,39 +40,30 @@
#define vhas_nul v2
#define vhas_chr v3
#define vrepmask v4
-#define vrepmask2 v5
-#define vend v6
-#define dend d6
+#define vend v5
+#define dend d5
/* Core algorithm.
For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
- per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
- requested character, bits 2-3 are set if the byte is NUL (or matched), and
- bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
- bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
- in the syndrome reflect the order in which things occur in the original
- string, counting trailing zeros identifies exactly which byte matched. */
+ per byte. Bits 0-1 are set if the relevant byte matched the requested
+ character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+ zeroes gives the position of the matching byte if it is a multiple of 4.
+ If it is not a multiple of 4, there was no match. */
ENTRY (strchr)
PTR_ARG (0)
bic src, srcin, 15
dup vrepchr.16b, chrin
ld1 {vdata.16b}, [src]
- mov wtmp2, 0x3003
- dup vrepmask.8h, wtmp2
+ movi vrepmask.16b, 0x33
cmeq vhas_nul.16b, vdata.16b, 0
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
- mov wtmp2, 0xf00f
- dup vrepmask2.8h, wtmp2
-
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- lsl tmp3, srcin, 2
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
-
+ lsl tmp2, srcin, 2
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
- lsr tmp1, tmp1, tmp3
+ lsr tmp1, tmp1, tmp2
cbz tmp1, L(loop)
rbit tmp1, tmp1
@@ -87,28 +77,34 @@ ENTRY (strchr)
.p2align 4
L(loop):
- ldr qdata, [src, 16]!
+ ldr qdata, [src, 16]
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov tmp1, dend
+ cbnz tmp1, L(end)
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov tmp1, dend
cbz tmp1, L(loop)
+ sub src, src, 16
+L(end):
#ifdef __AARCH64EB__
bif vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
#else
bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
- and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
- addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov tmp1, dend
rbit tmp1, tmp1
#endif
+ add src, src, 16
clz tmp1, tmp1
- /* Tmp1 is an even multiple of 2 if the target character was
- found first. Otherwise we've found the end of string. */
+ /* Tmp1 is a multiple of 4 if the target character was found. */
tst tmp1, 2
add result, src, tmp1, lsr 2
csel result, result, xzr, eq
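Two details of the loop are worth spelling out. First, a single cmhs doubles as
the end-of-string test: cmeq leaves 0x00 or 0xff in each byte, so the unsigned
compare has_chr >= data is all-ones exactly when a byte matched the character or
is NUL (this trick predates the patch, which duplicates the check so each
iteration covers 32 bytes). Second, the sub/add pair re-points src at whichever
16-byte chunk hit. Below is a rough C sketch under the same assumptions as the
model above; any_match_or_nul and find_hit_chunk are hypothetical helper names,
and vpmax over the two vector halves stands in for umaxp.

#include <arm_neon.h>
#include <stdint.h>

/* Models cmeq + cmhs + umaxp + fmov: "does this chunk hold c or a NUL?"  */
static int
any_match_or_nul (uint8x16_t data, uint8x16_t repchr)
{
  uint8x16_t has_chr = vceqq_u8 (data, repchr);
  /* cmhs: unsigned has_chr >= data is all-ones when the byte matched
     (has_chr is 0xff) or when the byte is NUL (anything >= 0).  */
  uint8x16_t hit = vcgeq_u8 (has_chr, data);
  uint8x8_t folded = vpmax_u8 (vget_low_u8 (hit), vget_high_u8 (hit));
  return vget_lane_u64 (vreinterpret_u64_u8 (folded), 0) != 0;
}

/* Shape of the 2x-unrolled loop: on entry p is the last chunk already
   examined; returns the chunk containing the first match or NUL.  */
static const uint8_t *
find_hit_chunk (const uint8_t *p, uint8x16_t repchr)
{
  for (;;)
    {
      uint8x16_t data = vld1q_u8 (p + 16);   /* ldr qdata, [src, 16]  */
      if (any_match_or_nul (data, repchr))
        break;                               /* cbnz tmp1, L(end)     */
      p += 32;                               /* ldr qdata, [src, 32]! */
      data = vld1q_u8 (p);
      if (any_match_or_nul (data, repchr))
        {
          p -= 16;                           /* sub src, src, 16      */
          break;
        }
    }
  return p + 16;                             /* add src, src, 16      */
}

Once the hit chunk is known, the full nibble syndrome is rebuilt for just that
chunk (the bit/bif plus shrn sequence after L(end)), and the tst/csel pair
returns NULL when the trailing-zero count shows the NUL came first.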