diff options
Diffstat (limited to 'sysdeps/powerpc')
-rw-r--r-- | sysdeps/powerpc/memset.S | 273 | ||||
-rw-r--r-- | sysdeps/powerpc/strchr.S | 138 | ||||
-rw-r--r-- | sysdeps/powerpc/strcmp.S | 132 | ||||
-rw-r--r-- | sysdeps/powerpc/strcpy.S | 110 | ||||
-rw-r--r-- | sysdeps/powerpc/strlen.S | 126 |
5 files changed, 401 insertions, 378 deletions
diff --git a/sysdeps/powerpc/memset.S b/sysdeps/powerpc/memset.S index 1b95bc7591..c48c0af7c8 100644 --- a/sysdeps/powerpc/memset.S +++ b/sysdeps/powerpc/memset.S @@ -19,181 +19,192 @@ #include <sysdep.h> -EALIGN(memset,5,1) /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); Returns 's'. The memset is done in three sizes: byte (8 bits), word (32 bits), cache line (256 bits). There is a special case for setting cache lines - to 0, to take advantage of the dcbz instruction. - r6: current address we are storing at - r7: number of bytes we are setting now (when aligning) */ + to 0, to take advantage of the dcbz instruction. */ + +EALIGN (memset, 5, 1) + +#define rTMP r0 +#define rRTN r3 /* initial value of 1st argument */ +#define rCHR r4 /* char to set in each byte */ +#define rLEN r5 /* length of region to set */ +#define rMEMP r6 /* address at which we are storing */ +#define rALIGN r7 /* number of bytes we are setting now (when aligning) */ +#define rMEMP2 r8 + +#define rPOS32 r7 /* constant +32 for clearing with dcbz */ +#define rNEG64 r8 /* constant -64 for clearing with dcbz */ +#define rNEG32 r9 /* constant -32 for clearing with dcbz */ /* take care of case for size <= 4 */ - cmplwi cr1,r5,4 - andi. r7,r3,3 - mr r6,r3 - ble- cr1,L(small) + cmplwi cr1, rLEN, 4 + andi. rALIGN, rRTN, 3 + mr rMEMP, rRTN + ble- cr1, L(small) /* align to word boundary */ - cmplwi cr5,r5,31 - rlwimi r4,r4,8,16,23 - beq+ L(aligned) # 8th instruction from .align - mtcrf 0x01,r3 - subfic r7,r7,4 - add r6,r6,r7 - sub r5,r5,r7 - bf+ 31,L(g0) - stb r4,0(r3) - bt 30,L(aligned) -L(g0): sth r4,-2(r6) # 16th instruction from .align + cmplwi cr5, rLEN, 31 + rlwimi rCHR, rCHR, 8, 16, 23 + beq+ L(aligned) /* 8th instruction from .align */ + mtcrf 0x01, rRTN + subfic rALIGN, rALIGN, 4 + add rMEMP, rMEMP, rALIGN + sub rLEN, rLEN, rALIGN + bf+ 31, L(g0) + stb rCHR, 0(rRTN) + bt 30, L(aligned) +L(g0): sth rCHR, -2(rMEMP) /* 16th instruction from .align */ /* take care of case for size < 31 */ L(aligned): - mtcrf 0x01,r5 - rlwimi r4,r4,16,0,15 - ble cr5,L(medium) + mtcrf 0x01, rLEN + rlwimi rCHR, rCHR, 16, 0, 15 + ble cr5, L(medium) /* align to cache line boundary... */ - andi. r7,r6,0x1C - subfic r7,r7,0x20 - beq L(caligned) - mtcrf 0x01,r7 - add r6,r6,r7 - sub r5,r5,r7 - cmplwi cr1,r7,0x10 - mr r8,r6 - bf 28,L(a1) - stw r4,-4(r8) - stwu r4,-8(r8) -L(a1): blt cr1,L(a2) - stw r4,-4(r8) # 32nd instruction from .align - stw r4,-8(r8) - stw r4,-12(r8) - stwu r4,-16(r8) -L(a2): bf 29,L(caligned) - stw r4,-4(r8) + andi. rALIGN, rMEMP, 0x1C + subfic rALIGN, rALIGN, 0x20 + beq L(caligned) + mtcrf 0x01, rALIGN + add rMEMP, rMEMP, rALIGN + sub rLEN, rLEN, rALIGN + cmplwi cr1, rALIGN, 0x10 + mr rMEMP2, rMEMP + bf 28, L(a1) + stw rCHR, -4(rMEMP2) + stwu rCHR, -8(rMEMP2) +L(a1): blt cr1, L(a2) + stw rCHR, -4(rMEMP2) /* 32nd instruction from .align */ + stw rCHR, -8(rMEMP2) + stw rCHR, -12(rMEMP2) + stwu rCHR, -16(rMEMP2) +L(a2): bf 29, L(caligned) + stw rCHR, -4(rMEMP2) /* now aligned to a cache line. */ L(caligned): - cmplwi cr1,r4,0 - clrrwi. r7,r5,5 - mtcrf 0x01,r5 # 40th instruction from .align - beq cr1,L(zloopstart) # special case for clearing memory using dcbz - srwi r0,r7,5 - mtctr r0 - beq L(medium) # we may not actually get to do a full line - clrlwi. r5,r5,27 - add r6,r6,r7 - li r8,-0x40 - bdz L(cloopdone) # 48th instruction from .align + cmplwi cr1, rCHR, 0 + clrrwi. rALIGN, rLEN, 5 + mtcrf 0x01, rLEN /* 40th instruction from .align */ + beq cr1, L(zloopstart) /* special case for clearing memory using dcbz */ + srwi rTMP, rALIGN, 5 + mtctr rTMP + beq L(medium) /* we may not actually get to do a full line */ + clrlwi. rLEN, rLEN, 27 + add rMEMP, rMEMP, rALIGN + li rNEG64, -0x40 + bdz L(cloopdone) /* 48th instruction from .align */ -L(c3): dcbz r8,r6 - stw r4,-4(r6) - stw r4,-8(r6) - stw r4,-12(r6) - stw r4,-16(r6) - nop # let 601 fetch last 4 instructions of loop - stw r4,-20(r6) - stw r4,-24(r6) # 56th instruction from .align - nop # let 601 fetch first 8 instructions of loop - stw r4,-28(r6) - stwu r4,-32(r6) - bdnz L(c3) +L(c3): dcbz rNEG64, rMEMP + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) + stw rCHR, -12(rMEMP) + stw rCHR, -16(rMEMP) + nop /* let 601 fetch last 4 instructions of loop */ + stw rCHR, -20(rMEMP) + stw rCHR, -24(rMEMP) /* 56th instruction from .align */ + nop /* let 601 fetch first 8 instructions of loop */ + stw rCHR, -28(rMEMP) + stwu rCHR, -32(rMEMP) + bdnz L(c3) L(cloopdone): - stw r4,-4(r6) - stw r4,-8(r6) - stw r4,-12(r6) - stw r4,-16(r6) # 64th instruction from .align - stw r4,-20(r6) - cmplwi cr1,r5,16 - stw r4,-24(r6) - stw r4,-28(r6) - stwu r4,-32(r6) + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) + stw rCHR, -12(rMEMP) + stw rCHR, -16(rMEMP) /* 64th instruction from .align */ + stw rCHR, -20(rMEMP) + cmplwi cr1, rLEN, 16 + stw rCHR, -24(rMEMP) + stw rCHR, -28(rMEMP) + stwu rCHR, -32(rMEMP) beqlr - add r6,r6,r7 - b L(medium_tail2) # 72nd instruction from .align + add rMEMP, rMEMP, rALIGN + b L(medium_tail2) /* 72nd instruction from .align */ .align 5 nop /* Clear lines of memory in 128-byte chunks. */ L(zloopstart): - clrlwi r5,r5,27 - mtcrf 0x02,r7 - srwi. r0,r7,7 - mtctr r0 - li r7,0x20 - li r8,-0x40 - cmplwi cr1,r5,16 # 8 - bf 26,L(z0) - dcbz 0,r6 - addi r6,r6,0x20 -L(z0): li r9,-0x20 - bf 25,L(z1) - dcbz 0,r6 - dcbz r7,r6 - addi r6,r6,0x40 # 16 -L(z1): cmplwi cr5,r5,0 - beq L(medium) + clrlwi rLEN, rLEN, 27 + mtcrf 0x02, rALIGN + srwi. rTMP, rALIGN, 7 + mtctr rTMP + li rPOS32, 0x20 + li rNEG64, -0x40 + cmplwi cr1, rLEN, 16 /* 8 */ + bf 26, L(z0) + dcbz 0, rMEMP + addi rMEMP, rMEMP, 0x20 +L(z0): li rNEG32, -0x20 + bf 25, L(z1) + dcbz 0, rMEMP + dcbz rPOS32, rMEMP + addi rMEMP, rMEMP, 0x40 /* 16 */ +L(z1): cmplwi cr5, rLEN, 0 + beq L(medium) L(zloop): - dcbz 0,r6 - dcbz r7,r6 - addi r6,r6,0x80 - dcbz r8,r6 - dcbz r9,r6 - bdnz L(zloop) - beqlr cr5 - b L(medium_tail2) + dcbz 0, rMEMP + dcbz rPOS32, rMEMP + addi rMEMP, rMEMP, 0x80 + dcbz rNEG64, rMEMP + dcbz rNEG32, rMEMP + bdnz L(zloop) + beqlr cr5 + b L(medium_tail2) .align 5 L(small): /* Memset of 4 bytes or less. */ - cmplwi cr5,r5,1 - cmplwi cr1,r5,3 - bltlr cr5 - stb r4,0(r6) - beqlr cr5 + cmplwi cr5, rLEN, 1 + cmplwi cr1, rLEN, 3 + bltlr cr5 + stb rCHR, 0(rMEMP) + beqlr cr5 nop - stb r4,1(r6) - bltlr cr1 - stb r4,2(r6) - beqlr cr1 + stb rCHR, 1(rMEMP) + bltlr cr1 + stb rCHR, 2(rMEMP) + beqlr cr1 nop - stb r4,3(r6) + stb rCHR, 3(rMEMP) blr /* Memset of 0-31 bytes. */ .align 5 L(medium): - cmplwi cr1,r5,16 + cmplwi cr1, rLEN, 16 L(medium_tail2): - add r6,r6,r5 + add rMEMP, rMEMP, rLEN L(medium_tail): - bt- 31,L(medium_31t) - bt- 30,L(medium_30t) + bt- 31, L(medium_31t) + bt- 30, L(medium_30t) L(medium_30f): - bt- 29,L(medium_29t) + bt- 29, L(medium_29t) L(medium_29f): - bge- cr1,L(medium_27t) - bflr- 28 - stw r4,-4(r6) # 8th instruction from .align - stw r4,-8(r6) + bge- cr1, L(medium_27t) + bflr- 28 + stw rCHR, -4(rMEMP) /* 8th instruction from .align */ + stw rCHR, -8(rMEMP) blr L(medium_31t): - stbu r4,-1(r6) - bf- 30,L(medium_30f) + stbu rCHR, -1(rMEMP) + bf- 30, L(medium_30f) L(medium_30t): - sthu r4,-2(r6) - bf- 29,L(medium_29f) + sthu rCHR, -2(rMEMP) + bf- 29, L(medium_29f) L(medium_29t): - stwu r4,-4(r6) - blt- cr1,L(medium_27f) # 16th instruction from .align + stwu rCHR, -4(rMEMP) + blt- cr1, L(medium_27f) /* 16th instruction from .align */ L(medium_27t): - stw r4,-4(r6) - stw r4,-8(r6) - stw r4,-12(r6) - stwu r4,-16(r6) + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) + stw rCHR, -12(rMEMP) + stwu rCHR, -16(rMEMP) L(medium_27f): - bflr- 28 + bflr- 28 L(medium_28t): - stw r4,-4(r6) - stw r4,-8(r6) + stw rCHR, -4(rMEMP) + stw rCHR, -8(rMEMP) blr END(memset) diff --git a/sysdeps/powerpc/strchr.S b/sysdeps/powerpc/strchr.S index 5367a5a4fe..4662447179 100644 --- a/sysdeps/powerpc/strchr.S +++ b/sysdeps/powerpc/strchr.S @@ -1,5 +1,5 @@ /* Optimized strchr implementation for PowerPC. - Copyright (C) 1997, 1999 Free Software Foundation, Inc. + Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,91 +21,95 @@ /* See strlen.s for comments on how this works. */ -/* char * [r3] strchr (const char *s [r3] , int c [r4] ) +/* char * [r3] strchr (const char *s [r3] , int c [r4] ) */ - r0: a temporary - r3: our return result. - r4: byte we're looking for, spread over the whole word - r5: the current word - r6: the constant 0xfefefeff (-0x01010101) - r7: the constant 0x7f7f7f7f - r8: pointer to the current word. - r9: a temporary - r10: the number of bits we should ignore in the first word - r11: a mask with the bits to ignore set to 0 - r12: a temporary */ -ENTRY(strchr) - rlwimi r4,r4,8,16,23 - li r11,-1 - rlwimi r4,r4,16,0,15 - lis r6,0xfeff - lis r7,0x7f7f - clrrwi r8,r3,2 - addi r7,r7,0x7f7f - addi r6,r6,0xfffffeff - rlwinm r10,r3,3,27,28 +ENTRY (strchr) + +#define rTMP1 r0 +#define rRTN r3 /* outgoing result */ +#define rSTRin r3 /* incoming string arg */ +#define rCHR r4 /* byte we're looking for, spread over the whole word */ +#define rCLZB rCHR /* leading zero byte count */ +#define rWORD r5 /* the current word */ +#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */ +#define r7F7F r7 /* constant 0x7f7f7f7f */ +#define rSTR r8 /* current word pointer */ +#define rTMP2 r9 +#define rIGN r10 /* number of bits we should ignore in the first word */ +#define rMASK r11 /* mask with the bits to ignore set to 0 */ +#define rTMP3 r12 + + rlwimi rCHR, rCHR, 8, 16, 23 + li rMASK, -1 + rlwimi rCHR, rCHR, 16, 0, 15 + lis rFEFE, -0x101 + lis r7F7F, 0x7f7f + clrrwi rSTR, rSTRin, 2 + addi r7F7F, r7F7F, 0x7f7f + addi rFEFE, rFEFE, -0x101 + rlwinm rIGN, rSTRin, 3, 27, 28 /* Test the first (partial?) word. */ - lwz r5,0(r8) - srw r11,r11,r10 - orc r5,r5,r11 - add r0,r6,r5 - nor r9,r7,r5 - and. r0,r0,r9 - xor r12,r4,r5 - orc r12,r12,r11 - b L(loopentry) + lwz rWORD, 0(rSTR) + srw rMASK, rMASK, rIGN + orc rWORD, rWORD, rMASK + add rTMP1, rFEFE, rWORD + nor rTMP2, r7F7F, rWORD + and. rTMP1, rTMP1, rTMP2 + xor rTMP3, rCHR, rWORD + orc rTMP3, rTMP3, rMASK + b L(loopentry) /* The loop. */ -L(loop):lwzu r5,4(r8) - and. r0,r0,r9 -/* Test for 0. */ - add r0,r6,r5 - nor r9,r7,r5 - bne L(foundit) - and. r0,r0,r9 +L(loop):lwzu rWORD, 4(rSTR) + and. rTMP1, rTMP1, rTMP2 +/* Test for 0. */ + add rTMP1, rFEFE, rWORD + nor rTMP2, r7F7F, rWORD + bne L(foundit) + and. rTMP1, rTMP1, rTMP2 /* Start test for the bytes we're looking for. */ - xor r12,r4,r5 + xor rTMP3, rCHR, rWORD L(loopentry): - add r0,r6,r12 - nor r9,r7,r12 - beq L(loop) + add rTMP1, rFEFE, rTMP3 + nor rTMP2, r7F7F, rTMP3 + beq L(loop) /* There is a zero byte in the word, but may also be a matching byte (either before or after the zero byte). In fact, we may be looking for a zero byte, in which case we return a match. We guess that this hasn't happened, though. */ L(missed): - and. r0,r0,r9 - li r3,0 + and. rTMP1, rTMP1, rTMP2 + li rRTN, 0 beqlr /* It did happen. Decide which one was first... I'm not sure if this is actually faster than a sequence of rotates, compares, and branches (we use it anyway because it's shorter). */ - and r6,r7,r5 - or r11,r7,r5 - and r0,r7,r12 - or r10,r7,r12 - add r6,r6,r7 - add r0,r0,r7 - nor r5,r11,r6 - nor r9,r10,r0 - cmplw r5,r9 + and rFEFE, r7F7F, rWORD + or rMASK, r7F7F, rWORD + and rTMP1, r7F7F, rTMP3 + or rIGN, r7F7F, rTMP3 + add rFEFE, rFEFE, r7F7F + add rTMP1, rTMP1, r7F7F + nor rWORD, rMASK, rFEFE + nor rTMP2, rIGN, rTMP1 + cmplw rWORD, rTMP2 bgtlr - cntlzw r4,r9 - srwi r4,r4,3 - add r3,r8,r4 + cntlzw rCLZB, rTMP2 + srwi rCLZB, rCLZB, 3 + add rRTN, rSTR, rCLZB blr L(foundit): - and r0,r7,r12 - or r10,r7,r12 - add r0,r0,r7 - nor r9,r10,r0 - cntlzw r4,r9 - subi r8,r8,4 - srwi r4,r4,3 - add r3,r8,r4 + and rTMP1, r7F7F, rTMP3 + or rIGN, r7F7F, rTMP3 + add rTMP1, rTMP1, r7F7F + nor rTMP2, rIGN, rTMP1 + cntlzw rCLZB, rTMP2 + subi rSTR, rSTR, 4 + srwi rCLZB, rCLZB, 3 + add rRTN, rSTR, rCLZB blr -END(strchr) +END (strchr) -weak_alias(strchr,index) +weak_alias(strchr, index) diff --git a/sysdeps/powerpc/strcmp.S b/sysdeps/powerpc/strcmp.S index 92e9858d13..1accdd70cb 100644 --- a/sysdeps/powerpc/strcmp.S +++ b/sysdeps/powerpc/strcmp.S @@ -21,95 +21,93 @@ /* See strlen.s for comments on how the end-of-string testing works. */ -EALIGN(strcmp,4,0) -/* int [r3] strcmp (const char *p1 [r3], const char *p2 [r4]) */ +/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ -/* General register assignments: - r0: temporary - r3: pointer to previous word in s1 - r4: pointer to previous word in s2 - r5: current word from s1 - r6: current word from s2 - r7: 0xfefefeff - r8: 0x7f7f7f7f - r9: ~(word in s1 | 0x7f7f7f7f) */ +EALIGN (strcmp, 4, 0) -/* Register assignments in the prologue: - r10: low 2 bits of p2-p1 - r11: mask to orc with r5/r6 */ +#define rTMP r0 +#define rRTN r3 /* return value */ +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rWORD1 r5 /* current word in s1 */ +#define rWORD2 r6 /* current word in s2 */ +#define rFEFE r7 /* constant 0xfefefeff (-0x01010101) */ +#define r7F7F r8 /* constant 0x7f7f7f7f */ +#define rNEG r9 /* ~(word in s1 | 0x7f7f7f7f) */ +#define rBITDIF r10 /* bits that differ in s1 & s2 words */ - or r0,r4,r3 - clrlwi. r0,r0,30 - lis r7,0xfeff - bne L(unaligned) + or rTMP, rSTR2, rSTR1 + clrlwi. rTMP, rTMP, 30 + lis rFEFE, -0x101 + bne L(unaligned) - lwz r5,0(r3) - lwz r6,0(r4) - lis r8,0x7f7f - addi r7,r7,-0x101 - addi r8,r8,0x7f7f - b L(g1) + lwz rWORD1, 0(rSTR1) + lwz rWORD2, 0(rSTR2) + lis r7F7F, 0x7f7f + addi rFEFE, rFEFE, -0x101 + addi r7F7F, r7F7F, 0x7f7f + b L(g1) -L(g0): lwzu r5,4(r3) - bne cr1,L(different) - lwzu r6,4(r4) -L(g1): add r0,r7,r5 - nor r9,r8,r5 - and. r0,r0,r9 - cmpw cr1,r5,r6 - beq+ L(g0) +L(g0): lwzu rWORD1, 4(rSTR1) + bne cr1, L(different) + lwzu rWORD2, 4(rSTR2) +L(g1): add rTMP, rFEFE, rWORD1 + nor rNEG, r7F7F, rWORD1 + and. rTMP, rTMP, rNEG + cmpw cr1, rWORD1, rWORD2 + beq+ L(g0) L(endstring): /* OK. We've hit the end of the string. We need to be careful that we don't compare two strings as different because of gunk beyond the end of the strings... */ - and r0,r8,r5 - beq cr1,L(equal) - add r0,r0,r8 - xor. r10,r5,r6 - andc r9,r9,r0 - blt- L(highbit) - cntlzw r10,r10 - cntlzw r9,r9 - addi r9,r9,7 - cmpw cr1,r9,r10 - sub r3,r5,r6 - bgelr+ cr1 + and rTMP, r7F7F, rWORD1 + beq cr1, L(equal) + add rTMP, rTMP, r7F7F + xor. rBITDIF, rWORD1, rWORD2 + andc rNEG, rNEG, rTMP + blt- L(highbit) + cntlzw rBITDIF, rBITDIF + cntlzw rNEG, rNEG + addi rNEG, rNEG, 7 + cmpw cr1, rNEG, rBITDIF + sub rRTN, rWORD1, rWORD2 + bgelr+ cr1 L(equal): - li r3,0 + li rRTN, 0 blr L(different): - lwz r5,-4(r3) - xor. r10,r5,r6 - sub r3,r5,r6 + lwz rWORD1, -4(rSTR1) + xor. rBITDIF, rWORD1, rWORD2 + sub rRTN, rWORD1, rWORD2 bgelr+ L(highbit): - ori r3,r6,1 + ori rRTN, rWORD2, 1 blr /* Oh well. In this case, we just do a byte-by-byte comparison. */ .align 4 L(unaligned): - lbz r5,0(r3) - lbz r6,0(r4) - b L(u1) + lbz rWORD1, 0(rSTR1) + lbz rWORD2, 0(rSTR2) + b L(u1) -L(u0): lbzu r5,1(r3) - bne- L(u4) - lbzu r6,1(r4) -L(u1): cmpwi cr1,r5,0 - beq- cr1,L(u3) - cmpw r5,r6 - bne- L(u3) - lbzu r5,1(r3) - lbzu r6,1(r4) - cmpwi cr1,r5,0 - cmpw r5,r6 - bne+ cr1,L(u0) -L(u3): sub r3,r5,r6 +L(u0): lbzu rWORD1, 1(rSTR1) + bne- L(u4) + lbzu rWORD2, 1(rSTR2) +L(u1): cmpwi cr1, rWORD1, 0 + beq- cr1, L(u3) + cmpw rWORD1, rWORD2 + bne- L(u3) + lbzu rWORD1, 1(rSTR1) + lbzu rWORD2, 1(rSTR2) + cmpwi cr1, rWORD1, 0 + cmpw rWORD1, rWORD2 + bne+ cr1, L(u0) +L(u3): sub rRTN, rWORD1, rWORD2 blr -L(u4): lbz r5,-1(r3) - sub r3,r5,r6 +L(u4): lbz rWORD1, -1(rSTR1) + sub rRTN, rWORD1, rWORD2 blr END(strcmp) diff --git a/sysdeps/powerpc/strcpy.S b/sysdeps/powerpc/strcpy.S index 0767921d65..901ccf1259 100644 --- a/sysdeps/powerpc/strcpy.S +++ b/sysdeps/powerpc/strcpy.S @@ -21,80 +21,80 @@ /* See strlen.s for comments on how the end-of-string testing works. */ -EALIGN(strcpy,4,0) /* char * [r3] strcpy (char *dest [r3], const char *src [r4]) */ -/* General register assignments: - r0: temporary - r3: saved `dest' - r4: pointer to previous word in src - r5: pointer to previous word in dest - r6: current word from src - r7: 0xfefefeff - r8: 0x7f7f7f7f - r9: ~(word in src | 0x7f7f7f7f) - r10: alternate word from src. */ +EALIGN(strcpy, 4, 0) - or r0,r4,r3 - clrlwi. r0,r0,30 - addi r5,r3,-4 - bne L(unaligned) +#define rTMP r0 +#define rRTN r3 /* incoming DEST arg preserved as result */ +#define rSRC r4 /* pointer to previous word in src */ +#define rDEST r5 /* pointer to previous word in dest */ +#define rWORD r6 /* current word from src */ +#define rFEFE r7 /* constant 0xfefefeff (-0x01010101) */ +#define r7F7F r8 /* constant 0x7f7f7f7f */ +#define rNEG r9 /* ~(word in s1 | 0x7f7f7f7f) */ +#define rALT r10 /* alternate word from src */ - lis r7,0xfeff - lis r8,0x7f7f - lwz r6,0(r4) - addi r7,r7,-0x101 - addi r8,r8,0x7f7f - b L(g2) + or rTMP, rSRC, rRTN + clrlwi. rTMP, rTMP, 30 + addi rDEST, rRTN, -4 + bne L(unaligned) -L(g0): lwzu r10,4(r4) - stwu r6,4(r5) - add r0,r7,r10 - nor r9,r8,r10 - and. r0,r0,r9 - bne- L(g1) - lwzu r6,4(r4) - stwu r10,4(r5) -L(g2): add r0,r7,r6 - nor r9,r8,r6 - and. r0,r0,r9 - beq+ L(g0) + lis rFEFE, -0x101 + lis r7F7F, 0x7f7f + lwz rWORD, 0(rSRC) + addi rFEFE, rFEFE, -0x101 + addi r7F7F, r7F7F, 0x7f7f + b L(g2) - mr r10,r6 +L(g0): lwzu rALT, 4(rSRC) + stwu rWORD, 4(rDEST) + add rTMP, rFEFE, rALT + nor rNEG, r7F7F, rALT + and. rTMP, rTMP, rNEG + bne- L(g1) + lwzu rWORD, 4(rSRC) + stwu rALT, 4(rDEST) +L(g2): add rTMP, rFEFE, rWORD + nor rNEG, r7F7F, rWORD + and. rTMP, rTMP, rNEG + beq+ L(g0) + + mr rALT, rWORD /* We've hit the end of the string. Do the rest byte-by-byte. */ -L(g1): rlwinm. r0,r10,8,24,31 - stb r0,4(r5) +L(g1): rlwinm. rTMP, rALT, 8, 24, 31 + stb rTMP, 4(rDEST) beqlr- - rlwinm. r0,r10,16,24,31 - stb r0,5(r5) + rlwinm. rTMP, rALT, 16, 24, 31 + stb rTMP, 5(rDEST) beqlr- - rlwinm. r0,r10,24,24,31 - stb r0,6(r5) + rlwinm. rTMP, rALT, 24, 24, 31 + stb rTMP, 6(rDEST) beqlr- - stb r10,7(r5) + stb rALT, 7(rDEST) blr /* Oh well. In this case, we just do a byte-by-byte copy. */ .align 4 nop L(unaligned): - lbz r6,0(r4) - addi r5,r3,-1 - cmpwi r6,0 - beq- L(u2) + lbz rWORD, 0(rSRC) + addi rDEST, rRTN, -1 + cmpwi rWORD, 0 + beq- L(u2) -L(u0): lbzu r10,1(r4) - stbu r6,1(r5) - cmpwi r10,0 - beq- L(u1) +L(u0): lbzu rALT, 1(rSRC) + stbu rWORD, 1(rDEST) + cmpwi rALT, 0 + beq- L(u1) nop /* Let 601 load start of loop. */ - lbzu r6,1(r4) - stbu r10,1(r5) - cmpwi r6,0 - bne+ L(u0) -L(u2): stb r6,1(r5) + lbzu rWORD, 1(rSRC) + stbu rALT, 1(rDEST) + cmpwi rWORD, 0 + bne+ L(u0) +L(u2): stb rWORD, 1(rDEST) blr -L(u1): stb r10,1(r5) +L(u1): stb rALT, 1(rDEST) blr END(strcpy) diff --git a/sysdeps/powerpc/strlen.S b/sysdeps/powerpc/strlen.S index b847ee4df3..18e76238c0 100644 --- a/sysdeps/powerpc/strlen.S +++ b/sysdeps/powerpc/strlen.S @@ -1,5 +1,5 @@ /* Optimized strlen implementation for PowerPC. - Copyright (C) 1997, 1999 Free Software Foundation, Inc. + Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -69,76 +69,86 @@ We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving them, the others we must save. */ -ENTRY(strlen) -/* On entry, r3 points to the string, and it's left that way. - We use r6 to store 0xfefefeff, and r7 to store 0x7f7f7f7f. - r4 is used to keep the current index into the string; r5 holds - the number of padding bits we prepend to the string to make it - start at a word boundary. r8 holds the 'current' word. - r9-12 are temporaries. r0 is used as a temporary and for discarded - results. */ - clrrwi r4,r3,2 - lis r7,0x7f7f - rlwinm r5,r3,3,27,28 - lwz r8,0(r4) - li r9,-1 - addi r7,r7,0x7f7f +/* int [r3] strlen (char *s [r3]) */ + +ENTRY (strlen) + +#define rTMP1 r0 +#define rRTN r3 /* incoming STR arg, outgoing result */ +#define rSTR r4 /* current string position */ +#define rPADN r5 /* number of padding bits we prepend to the + string to make it start at a word boundary */ +#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */ +#define r7F7F r7 /* constant 0x7f7f7f7f */ +#define rWORD1 r8 /* current string word */ +#define rWORD2 r9 /* next string word */ +#define rMASK r9 /* mask for first string word */ +#define rTMP2 r10 +#define rTMP3 r11 +#define rTMP4 r12 + + clrrwi rSTR, rRTN, 2 + lis r7F7F, 0x7f7f + rlwinm rPADN, rRTN, 3, 27, 28 + lwz rWORD1, 0(rSTR) + li rMASK, -1 + addi r7F7F, r7F7F, 0x7f7f /* That's the setup done, now do the first pair of words. We make an exception and use method (2) on the first two words, to reduce overhead. */ - srw r9,r9,r5 - and r0,r7,r8 - or r10,r7,r8 - add r0,r0,r7 - nor r0,r10,r0 - and. r8,r0,r9 - mtcrf 0x01,r3 - bne L(done0) - lis r6,0xfeff - addi r6,r6,-0x101 + srw rMASK, rMASK, rPADN + and rTMP1, r7F7F, rWORD1 + or rTMP2, r7F7F, rWORD1 + add rTMP1, rTMP1, r7F7F + nor rTMP1, rTMP2, rTMP1 + and. rWORD1, rTMP1, rMASK + mtcrf 0x01, rRTN + bne L(done0) + lis rFEFE, -0x101 + addi rFEFE, rFEFE, -0x101 /* Are we now aligned to a doubleword boundary? */ - bt 29,L(loop) + bt 29, L(loop) /* Handle second word of pair. */ - lwzu r8,4(r4) - and r0,r7,r8 - or r10,r7,r8 - add r0,r0,r7 - nor. r8,r10,r0 - bne L(done0) + lwzu rWORD1, 4(rSTR) + and rTMP1, r7F7F, rWORD1 + or rTMP2, r7F7F, rWORD1 + add rTMP1, rTMP1, r7F7F + nor. rWORD1, rTMP2, rTMP1 + bne L(done0) /* The loop. */ L(loop): - lwz r8,4(r4) - lwzu r9,8(r4) - add r0,r6,r8 - nor r10,r7,r8 - and. r0,r0,r10 - add r11,r6,r9 - nor r12,r7,r9 - bne L(done1) - and. r0,r11,r12 - beq L(loop) - - and r0,r7,r9 - add r0,r0,r7 - andc r8,r12,r0 - b L(done0) + lwz rWORD1, 4(rSTR) + lwzu rWORD2, 8(rSTR) + add rTMP1, rFEFE, rWORD1 + nor rTMP2, r7F7F, rWORD1 + and. rTMP1, rTMP1, rTMP2 + add rTMP3, rFEFE, rWORD2 + nor rTMP4, r7F7F, rWORD2 + bne L(done1) + and. rTMP1, rTMP3, rTMP4 + beq L(loop) + + and rTMP1, r7F7F, rWORD2 + add rTMP1, rTMP1, r7F7F + andc rWORD1, rTMP4, rTMP1 + b L(done0) L(done1): - and r0,r7,r8 - subi r4,r4,4 - add r0,r0,r7 - andc r8,r10,r0 + and rTMP1, r7F7F, rWORD1 + subi rSTR, rSTR, 4 + add rTMP1, rTMP1, r7F7F + andc rWORD1, rTMP2, rTMP1 -/* When we get to here, r4 points to the first word in the string that - contains a zero byte, and the most significant set bit in r8 is in that +/* When we get to here, rSTR points to the first word in the string that + contains a zero byte, and the most significant set bit in rWORD1 is in that byte. */ L(done0): - cntlzw r11,r8 - subf r0,r3,r4 - srwi r11,r11,3 - add r3,r0,r11 + cntlzw rTMP3, rWORD1 + subf rTMP1, rRTN, rSTR + srwi rTMP3, rTMP3, 3 + add rRTN, rTMP1, rTMP3 blr -END(strlen) +END (strlen) |