-rw-r--r--  sysdeps/aarch64/strrchr.S | 205
1 file changed, 91 insertions(+), 114 deletions(-)
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index 94da08d351..a9b2bf47c2 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -24,142 +24,119 @@
*
* ARMv8-a, AArch64
* Neon Available.
+ * MTE compatible.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
-
#define result x0
#define src x2
-#define tmp1 x3
-#define wtmp2 w4
-#define tmp3 x5
-#define src_match x6
-#define src_offset x7
-#define const_m1 x8
-#define tmp4 x9
-#define nul_match x10
-#define chr_match x11
+#define tmp x3
+#define wtmp w3
+#define synd x3
+#define shift x4
+#define src_match x4
+#define nul_match x5
+#define chr_match x6
#define vrepchr v0
-#define vdata1 v1
-#define vdata2 v2
-#define vhas_nul1 v3
-#define vhas_nul2 v4
-#define vhas_chr1 v5
-#define vhas_chr2 v6
-#define vrepmask_0 v7
-#define vrepmask_c v16
-#define vend1 v17
-#define vend2 v18
+#define vdata v1
+#define vhas_nul v2
+#define vhas_chr v3
+#define vrepmask v4
+#define vrepmask2 v5
+#define vend v5
+#define dend d5
/* Core algorithm.
- For each 32-byte hunk we calculate a 64-bit syndrome value, with
- two bits per byte (LSB is always in bits 0 and 1, for both big
- and little-endian systems). For each tuple, bit 0 is set iff
- the relevant byte matched the requested character; bit 1 is set
- iff the relevant byte matched the NUL end of string (we trigger
- off bit0 for the special case of looking for NUL). Since the bits
- in the syndrome reflect exactly the order in which things occur
- in the original string a count_trailing_zeros() operation will
- identify exactly which byte is causing the termination, and why. */
+ For each 16-byte chunk we calculate a 64-bit syndrome value, with
+ four bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bits 0-1 are set if
+ the relevant byte matched the requested character; bits 2-3 are set
+ if the relevant byte matched the NUL end of string. */
ENTRY(strrchr)
DELOUSE (0)
- cbz x1, L(null_search)
- /* Magic constant 0x40100401 to allow us to identify which lane
- matches the requested byte. Magic constant 0x80200802 used
- similarly for NUL termination. */
- mov wtmp2, #0x0401
- movk wtmp2, #0x4010, lsl #16
+ bic src, srcin, 15
dup vrepchr.16b, chrin
- bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
- dup vrepmask_c.4s, wtmp2
- mov src_offset, #0
- ands tmp1, srcin, #31
- add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
- b.eq L(aligned)
-
- /* Input string is not 32-byte aligned. Rather than forcing
- the padding bytes to a safe value, we calculate the syndrome
- for all the bytes, but then mask off those bits of the
- syndrome that are related to the padding. */
- ld1 {vdata1.16b, vdata2.16b}, [src], #32
- neg tmp1, tmp1
- cmeq vhas_nul1.16b, vdata1.16b, #0
- cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
- cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vhas_nul1.2d[0]
- lsl tmp1, tmp1, #1
- mov const_m1, #~0
- mov chr_match, vhas_chr1.2d[0]
- lsr tmp3, const_m1, tmp1
-
- bic nul_match, nul_match, tmp3 // Mask padding bits.
- bic chr_match, chr_match, tmp3 // Mask padding bits.
- cbnz nul_match, L(tail)
-
-L(loop):
- cmp chr_match, #0
- csel src_match, src, src_match, ne
- csel src_offset, chr_match, src_offset, ne
-L(aligned):
- ld1 {vdata1.16b, vdata2.16b}, [src], #32
- cmeq vhas_nul1.16b, vdata1.16b, #0
- cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
- cmeq vhas_nul2.16b, vdata2.16b, #0
- cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
- addp vend1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
- and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
- and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
- addp vend1.16b, vend1.16b, vend1.16b // 128->64
- addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b // 128->64
- mov nul_match, vend1.2d[0]
- mov chr_match, vhas_chr1.2d[0]
- cbz nul_match, L(loop)
-
- and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
- and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
- addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
- mov nul_match, vhas_nul1.2d[0]
+ mov wtmp, 0x3003
+ dup vrepmask.8h, wtmp
+ tst srcin, 15
+ beq L(loop1)
+
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ mov wtmp, 0xf00f
+ dup vrepmask2.8h, wtmp
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ and vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ lsl shift, srcin, 2
+ fmov synd, dend
+ lsr synd, synd, shift
+ lsl synd, synd, shift
+ ands nul_match, synd, 0xcccccccccccccccc
+ bne L(tail)
+ cbnz synd, L(loop2)
+
+ .p2align 5
+L(loop1):
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ cmhs vhas_nul.16b, vhas_chr.16b, vdata.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ cbz synd, L(loop1)
+
+ cmeq vhas_nul.16b, vdata.16b, 0
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ ands nul_match, synd, 0xcccccccccccccccc
+ beq L(loop2)
L(tail):
- /* Work out exactly where the string ends. */
- sub tmp4, nul_match, #1
- eor tmp4, tmp4, nul_match
- ands chr_match, chr_match, tmp4
- /* And pick the values corresponding to the last match. */
- csel src_match, src, src_match, ne
- csel src_offset, chr_match, src_offset, ne
-
- /* Count down from the top of the syndrome to find the last match. */
- clz tmp3, src_offset
- /* Src_match points beyond the word containing the match, so we can
- simply subtract half the bit-offset into the syndrome. Because
- we are counting down, we need to go back one more character. */
- add tmp3, tmp3, #2
- sub result, src_match, tmp3, lsr #1
- /* But if the syndrome shows no match was found, then return NULL. */
- cmp src_offset, #0
+ sub nul_match, nul_match, 1
+ and chr_match, synd, 0x3333333333333333
+ ands chr_match, chr_match, nul_match
+ sub result, src, 1
+ clz tmp, chr_match
+ sub result, result, tmp, lsr 2
csel result, result, xzr, ne
+ ret
+ .p2align 4
+L(loop2):
+ cmp synd, 0
+ csel src_match, src, src_match, ne
+ csel chr_match, synd, chr_match, ne
+ ld1 {vdata.16b}, [src], 16
+ cmeq vhas_nul.16b, vdata.16b, 0
+ cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
+ bit vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+ umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ tst synd, 0xcccccccccccccccc
+ beq L(loop2)
+
+ bic vhas_nul.8h, 0x0f, lsl 8
+ addp vend.16b, vhas_nul.16b, vhas_nul.16b
+ fmov synd, dend
+ and nul_match, synd, 0xcccccccccccccccc
+ sub nul_match, nul_match, 1
+ and tmp, synd, 0x3333333333333333
+ ands tmp, tmp, nul_match
+ csel chr_match, tmp, chr_match, ne
+ csel src_match, src, src_match, ne
+ sub src_match, src_match, 1
+ clz tmp, chr_match
+ sub result, src_match, tmp, lsr 2
ret
-L(null_search):
- b __strchrnul
END(strrchr)
weak_alias (strrchr, rindex)
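
For reference, the syndrome scheme described in the new header comment can be
sketched in portable C. This is a minimal illustration, not the glibc
implementation (the NEON code builds the same masks with cmeq/bit/addp); all
function names below are invented for the example, and it assumes a
GCC/Clang-style __builtin_clzll.

#include <stdint.h>
#include <stdio.h>

/* Build the 64-bit syndrome for one 16-byte chunk: four bits per byte,
   bits 0-1 of each tuple set on a character match, bits 2-3 set on a
   NUL match.  */
static uint64_t
make_syndrome (const unsigned char *chunk, unsigned char c)
{
  uint64_t synd = 0;
  for (int i = 0; i < 16; i++)
    {
      if (chunk[i] == c)
	synd |= (uint64_t) 0x3 << (4 * i);
      if (chunk[i] == 0)
	synd |= (uint64_t) 0xc << (4 * i);
    }
  return synd;
}

/* For an unaligned first chunk, loaded from the 16-byte aligned address
   below srcin, clear the tuples of the padding bytes before the string
   by shifting the syndrome down and back up by 4 bits per padding byte
   (this mirrors the lsr/lsl pair after the first load in the patch).  */
static uint64_t
mask_first_chunk (uint64_t synd, uintptr_t srcin)
{
  unsigned shift = (srcin & 15) * 4;
  return (synd >> shift) << shift;
}

/* Given the syndrome of the chunk containing the terminator, return the
   chunk offset of the last occurrence of c before (or at, when c is NUL)
   the terminator, or -1 if there is none.  This is the L(tail) sequence:
   mask, subtract one, and count leading zeros.  */
static int
last_match (uint64_t synd)
{
  uint64_t nul_match = synd & 0xccccccccccccccccULL;
  uint64_t chr_match = synd & 0x3333333333333333ULL;
  /* nul_match - 1 sets every bit below the first NUL tuple, discarding
     character matches beyond the terminator; NUL bits and character
     bits occupy disjoint positions mod 4, so later NUL tuples cannot
     leak back in.  */
  chr_match &= nul_match - 1;
  if (chr_match == 0)
    return -1;
  /* Each tuple is four bits wide, so the index of the highest set bit
     divided by four is the byte offset of the last match.  */
  return (63 - __builtin_clzll (chr_match)) >> 2;
}

int
main (void)
{
  /* The last 'l' in "hello" is at chunk offset 3.  */
  unsigned char chunk[16] = "hello";
  uint64_t synd = make_syndrome (chunk, 'l');
  printf ("%d\n", last_match (synd));
  /* Had the string started at byte 1 of the chunk, tuple 0 would be
     padding and would be cleared before use.  */
  printf ("%d\n", last_match (mask_first_chunk (synd, 1)));
  return 0;
}

The NEON version produces the same per-byte tuples with vector compares and
consumes them with the same mask/subtract/clz sequence seen at L(tail) above.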