diff options
author | Alan Modra <amodra@gmail.com> | 2013-08-17 18:46:05 +0930 |
---|---|---|
committer | Alan Modra <amodra@gmail.com> | 2013-10-04 10:40:22 +0930 |
commit | 664318c3eb07032e2bfcf47cb2aa3c89280c19e7 (patch) | |
tree | 338e8a4e5b1215319560caa795ce5830f2f46685 /sysdeps/powerpc/powerpc32/strchr.S | |
parent | 43b84013714c46e6dcae4a5564c5527777ad5e08 (diff) | |
download | glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar.gz glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.tar.bz2 glibc-664318c3eb07032e2bfcf47cb2aa3c89280c19e7.zip |
PowerPC LE strchr
http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html
Adds little-endian support to the optimised strchr assembly. I've also
tweaked the big-endian code a little. In power7/strchr.S, the tail of the
function checks that we didn't match 0 before finding a match for c; this
was done by comparing leading-zero counts. It's just as valid, and quicker,
to compare the raw output from cmpb.
Another little tweak is to use rldimi/insrdi in place of rlwimi for
the power7 strchr functions. Since rlwimi is cracked, it is a few
cycles slower. rldimi can be used on the 32-bit power7 functions
too.
* sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
support. Correct typos, formatting. Optimize tail. Use insrdi
rather than rlwimi.
* sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
* sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
little-endian support. Correct typos.
* sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise. Use insrdi
rather than rlwimi.
* sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define. Use
in loop and entry code to keep "and." results.
(strchr): Add little-endian support. Comment. Move cntlzd
earlier in tail.
* sysdeps/powerpc/powerpc32/strchr.S: Likewise.
Diffstat (limited to 'sysdeps/powerpc/powerpc32/strchr.S')
-rw-r--r-- | sysdeps/powerpc/powerpc32/strchr.S | 71 |
1 file changed, 51 insertions, 20 deletions
diff --git a/sysdeps/powerpc/powerpc32/strchr.S b/sysdeps/powerpc/powerpc32/strchr.S index c9952eeccf..6050565770 100644 --- a/sysdeps/powerpc/powerpc32/strchr.S +++ b/sysdeps/powerpc/powerpc32/strchr.S @@ -36,6 +36,8 @@ ENTRY (strchr) #define rIGN r10 /* number of bits we should ignore in the first word */ #define rMASK r11 /* mask with the bits to ignore set to 0 */ #define rTMP3 r12 +#define rTMP4 rIGN +#define rTMP5 rMASK rlwimi rCHR, rCHR, 8, 16, 23 @@ -49,64 +51,93 @@ ENTRY (strchr) addi r7F7F, r7F7F, 0x7f7f /* Test the first (partial?) word. */ lwz rWORD, 0(rSTR) +#ifdef __LITTLE_ENDIAN__ + slw rMASK, rMASK, rIGN +#else srw rMASK, rMASK, rIGN +#endif orc rWORD, rWORD, rMASK add rTMP1, rFEFE, rWORD nor rTMP2, r7F7F, rWORD - and. rTMP1, rTMP1, rTMP2 + and. rTMP4, rTMP1, rTMP2 xor rTMP3, rCHR, rWORD orc rTMP3, rTMP3, rMASK b L(loopentry) /* The loop. */ -L(loop):lwzu rWORD, 4(rSTR) - and. rTMP1, rTMP1, rTMP2 +L(loop): + lwzu rWORD, 4(rSTR) + and. rTMP5, rTMP1, rTMP2 /* Test for 0. */ - add rTMP1, rFEFE, rWORD - nor rTMP2, r7F7F, rWORD + add rTMP1, rFEFE, rWORD /* x - 0x01010101. */ + nor rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080. */ bne L(foundit) - and. rTMP1, rTMP1, rTMP2 + and. rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080. */ /* Start test for the bytes we're looking for. */ xor rTMP3, rCHR, rWORD L(loopentry): add rTMP1, rFEFE, rTMP3 nor rTMP2, r7F7F, rTMP3 beq L(loop) + /* There is a zero byte in the word, but may also be a matching byte (either before or after the zero byte). In fact, we may be looking for a - zero byte, in which case we return a match. We guess that this hasn't - happened, though. */ -L(missed): - and. rTMP1, rTMP1, rTMP2 + zero byte, in which case we return a match. */ + and. rTMP5, rTMP1, rTMP2 li rRTN, 0 beqlr -/* It did happen. Decide which one was first... - I'm not sure if this is actually faster than a sequence of - rotates, compares, and branches (we use it anyway because it's shorter). 
*/ +/* At this point: + rTMP5 bytes are 0x80 for each match of c, 0 otherwise. + rTMP4 bytes are 0x80 for each match of 0, 0 otherwise. + But there may be false matches in the next most significant byte from + a true match due to carries. This means we need to recalculate the + matches using a longer method for big-endian. */ +#ifdef __LITTLE_ENDIAN__ + addi rTMP1, rTMP5, -1 + andc rTMP1, rTMP1, rTMP5 + cntlzw rCLZB, rTMP1 + addi rTMP2, rTMP4, -1 + andc rTMP2, rTMP2, rTMP4 + cmplw rTMP1, rTMP2 + bgtlr + subfic rCLZB, rCLZB, 32-7 +#else +/* I think we could reduce this by two instructions by keeping the "nor" + results from the loop for reuse here. See strlen.S tail. Similarly + one instruction could be pruned from L(foundit). */ and rFEFE, r7F7F, rWORD - or rMASK, r7F7F, rWORD + or rTMP5, r7F7F, rWORD and rTMP1, r7F7F, rTMP3 - or rIGN, r7F7F, rTMP3 + or rTMP4, r7F7F, rTMP3 add rFEFE, rFEFE, r7F7F add rTMP1, rTMP1, r7F7F - nor rWORD, rMASK, rFEFE - nor rTMP2, rIGN, rTMP1 + nor rWORD, rTMP5, rFEFE + nor rTMP2, rTMP4, rTMP1 + cntlzw rCLZB, rTMP2 cmplw rWORD, rTMP2 bgtlr - cntlzw rCLZB, rTMP2 +#endif srwi rCLZB, rCLZB, 3 add rRTN, rSTR, rCLZB blr L(foundit): +#ifdef __LITTLE_ENDIAN__ + addi rTMP1, rTMP5, -1 + andc rTMP1, rTMP1, rTMP5 + cntlzw rCLZB, rTMP1 + subfic rCLZB, rCLZB, 32-7-32 + srawi rCLZB, rCLZB, 3 +#else and rTMP1, r7F7F, rTMP3 - or rIGN, r7F7F, rTMP3 + or rTMP4, r7F7F, rTMP3 add rTMP1, rTMP1, r7F7F - nor rTMP2, rIGN, rTMP1 + nor rTMP2, rTMP4, rTMP1 cntlzw rCLZB, rTMP2 subi rSTR, rSTR, 4 srwi rCLZB, rCLZB, 3 +#endif add rRTN, rSTR, rCLZB blr END (strchr) |