diff options
Diffstat (limited to 'sysdeps/alpha')
-rw-r--r-- | sysdeps/alpha/alphaev6/stxncpy.S | 85 | ||||
-rw-r--r-- | sysdeps/alpha/stxncpy.S | 72 |
2 files changed, 85 insertions, 72 deletions
diff --git a/sysdeps/alpha/alphaev6/stxncpy.S b/sysdeps/alpha/alphaev6/stxncpy.S index 21e94ba7e2..f39c23a886 100644 --- a/sysdeps/alpha/alphaev6/stxncpy.S +++ b/sysdeps/alpha/alphaev6/stxncpy.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2000 Free Software Foundation, Inc. +/* Copyright (C) 2000, 2002 Free Software Foundation, Inc. Contributed by Richard Henderson (rth@tamu.edu) EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. This file is part of the GNU C Library. @@ -210,35 +210,30 @@ $u_head: cmpbge zero, t6, t7 # E : beq a2, $u_eocfin # U : - nop + lda t6, -1 # E : nop bne t7, $u_final # U : - lda t6, -1 # E : mask out the bits we have - mskql t6, a1, t6 # U : already seen (stall) + mskql t6, a1, t6 # U : mask out bits already seen stq_u t0, 0(a0) # L : store first output word + or t6, t2, t2 # E : - or t6, t2, t2 # E : - cmpbge zero, t2, t7 # E : find nulls in second partial (stall) - addq a0, 8, a0 # E : - subq a2, 1, a2 # E : - + cmpbge zero, t2, t7 # E : find nulls in second partial + addq a0, 8, a0 # E : + subq a2, 1, a2 # E : bne t7, $u_late_head_exit # U : + /* Finally, we've got all the stupid leading edge cases taken care of and we can set up to enter the main loop. */ extql t2, a1, t1 # U : position hi-bits of lo word + beq a2, $u_eoc # U : ldq_u t2, 8(a1) # L : read next high-order source word addq a1, 8, a1 # E : - cmpbge zero, t2, t7 # E : (stall) - beq a2, $u_eoc # U : - nop - nop - - bne t7, $u_eos # e1 : - nop - nop - nop + extqh t2, a1, t0 # U : position lo-bits of hi word (stall) + cmpbge zero, t2, t7 # E : + nop + bne t7, $u_eos # U : /* Unaligned copy main loop. In order to avoid reading too much, the loop is structured to detect zeros in aligned source words. @@ -248,6 +243,7 @@ $u_head: to run as fast as possible. On entry to this basic block: + t0 == the shifted low-order bits from the current source word t1 == the shifted high-order bits from the previous source word t2 == the unshifted current source word @@ -255,25 +251,20 @@ $u_head: .align 4 $u_loop: - extqh t2, a1, t0 # U : extract high bits for current word - addq a1, 8, a1 # E : - extql t2, a1, t3 # U : extract low bits for next time + or t0, t1, t0 # E : current dst word now complete + subq a2, 1, a2 # E : decrement word count + extql t2, a1, t1 # U : extract high bits for next time addq a0, 8, a0 # E : - or t0, t1, t0 # E : current dst word now complete - ldq_u t2, 0(a1) # U : Latency=3 load high word for next time - stq_u t0, -8(a0) # U : save the current word (stall) - mov t3, t1 # E : + stq_u t0, -8(a0) # L : save the current word + beq a2, $u_eoc # U : + ldq_u t2, 8(a1) # L : Latency=3 load high word for next time + addq a1, 8, a1 # E : - subq a2, 1, a2 # E : - cmpbge zero, t2, t7 # E : test new word for eos (2 cycle stall for data) - beq a2, $u_eoc # U : (stall) + extqh t2, a1, t0 # U : extract low bits (2 cycle stall) + cmpbge zero, t2, t7 # E : test new word for eos nop - beq t7, $u_loop # U : - nop - nop - nop /* We've found a zero somewhere in the source word we just read. If it resides in the lower half, we have one (probably partial) @@ -281,11 +272,12 @@ $u_loop: have one full and one partial word left to write out. On entry to this basic block: + t0 == the shifted low-order bits from the current source word t1 == the shifted high-order bits from the previous source word t2 == the unshifted current source word. */ $u_eos: - extqh t2, a1, t0 # U : - or t0, t1, t0 # E : first (partial) source word complete (stall) + or t0, t1, t0 # E : first (partial) source word complete + nop cmpbge zero, t0, t7 # E : is the null in this first bit? (stall) bne t7, $u_final # U : (stall) @@ -323,17 +315,26 @@ $u_final: 1: stq_u t0, 0(a0) # L : ret (t9) # L0 : Latency=3 -$u_eoc: # end-of-count - extqh t2, a1, t0 # U : - or t0, t1, t0 # E : (stall) - cmpbge zero, t0, t7 # E : (stall) + /* Got to end-of-count before end of string. + On entry to this basic block: + t1 == the shifted high-order bits from the previous source word */ +$u_eoc: + and a1, 7, t6 # E : + sll t10, t6, t6 # U : (stall) + and t6, 0xff, t6 # E : (stall) + bne t6, 1f # U : (stall) + + ldq_u t2, 8(a1) # L : load final src word nop + extqh t2, a1, t0 # U : extract low bits for last word (stall) + or t1, t0, t1 # E : (stall) + +1: cmpbge zero, t1, t7 # E : + mov t1, t0 $u_eocfin: # end-of-count, final word or t10, t7, t7 # E : br $u_final # L0 : Latency=3 - nop - nop /* Unaligned copy entry point. */ .align 4 @@ -354,9 +355,7 @@ $unaligned: mskql t6, a0, t6 # U : nop nop - nop -1: - subq a1, t4, a1 # E : sub dest misalignment from src addr +1: subq a1, t4, a1 # E : sub dest misalignment from src addr /* If source misalignment is larger than dest misalignment, we need extra startup checks to avoid SEGV. */ diff --git a/sysdeps/alpha/stxncpy.S b/sysdeps/alpha/stxncpy.S index 9330f6d3e6..73bcd36e47 100644 --- a/sysdeps/alpha/stxncpy.S +++ b/sysdeps/alpha/stxncpy.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1996, 1997 Free Software Foundation, Inc. +/* Copyright (C) 1996, 1997, 2002 Free Software Foundation, Inc. Contributed by Richard Henderson (rth@tamu.edu) This file is part of the GNU C Library. @@ -183,10 +183,11 @@ $u_head: or t0, t6, t6 # e1 : mask original data for zero test cmpbge zero, t6, t7 # e0 : beq a2, $u_eocfin # .. e1 : - bne t7, $u_final # e1 : + lda t6, -1 # e0 : + bne t7, $u_final # .. e1 : - lda t6, -1 # e1 : mask out the bits we have - mskql t6, a1, t6 # e0 : already seen + mskql t6, a1, t6 # e0 : mask out bits already seen + nop # .. e1 : stq_u t0, 0(a0) # e0 : store first output word or t6, t2, t2 # .. e1 : cmpbge zero, t2, t7 # e0 : find nulls in second partial @@ -198,11 +199,13 @@ $u_head: of and we can set up to enter the main loop. */ extql t2, a1, t1 # e0 : position hi-bits of lo word - ldq_u t2, 8(a1) # .. e1 : read next high-order source word - addq a1, 8, a1 # e0 : - cmpbge zero, t2, t7 # e1 (stall) - beq a2, $u_eoc # e1 : - bne t7, $u_eos # e1 : + beq a2, $u_eoc # .. e1 : + ldq_u t2, 8(a1) # e0 : read next high-order source word + addq a1, 8, a1 # .. e1 : + extqh t2, a1, t0 # e0 : position lo-bits of hi word + cmpbge zero, t2, t7 # .. e1 : test new word for eos + nop # e0 : + bne t7, $u_eos # .. e1 : /* Unaligned copy main loop. In order to avoid reading too much, the loop is structured to detect zeros in aligned source words. @@ -212,6 +215,7 @@ $u_head: to run as fast as possible. On entry to this basic block: + t0 == the shifted low-order bits from the current source word t1 == the shifted high-order bits from the previous source word t2 == the unshifted current source word @@ -219,18 +223,18 @@ $u_head: .align 3 $u_loop: - extqh t2, a1, t0 # e0 : extract high bits for current word - addq a1, 8, a1 # .. e1 : - extql t2, a1, t3 # e0 : extract low bits for next time - addq a0, 8, a0 # .. e1 : or t0, t1, t0 # e0 : current dst word now complete - ldq_u t2, 0(a1) # .. e1 : load high word for next time - stq_u t0, -8(a0) # e0 : save the current word - mov t3, t1 # .. e1 : - subq a2, 1, a2 # e0 : + subq a2, 1, a2 # .. e1 : decrement word count + stq_u t0, 0(a0) # e0 : save the current word + addq a0, 8, a0 # .. e1 : + extql t2, a1, t1 # e0 : extract high bits for next time + beq a2, $u_eoc # .. e1 : + ldq_u t2, 8(a1) # e0 : load high word for next time + addq a1, 8, a1 # .. e1 : + nop # e0 : cmpbge zero, t2, t7 # .. e1 : test new word for eos - beq a2, $u_eoc # e1 : - beq t7, $u_loop # e1 : + extqh t2, a1, t0 # e0 : extract low bits for current word + beq t7, $u_loop # .. e1 : /* We've found a zero somewhere in the source word we just read. If it resides in the lower half, we have one (probably partial) @@ -238,25 +242,23 @@ $u_loop: have one full and one partial word left to write out. On entry to this basic block: + t0 == the shifted low-order bits from the current source word t1 == the shifted high-order bits from the previous source word t2 == the unshifted current source word. */ $u_eos: - extqh t2, a1, t0 # e0 : - or t0, t1, t0 # e1 : first (partial) source word complete - + or t0, t1, t0 # e0 : first (partial) source word complete cmpbge zero, t0, t7 # e0 : is the null in this first bit? bne t7, $u_final # .. e1 (zdb) stq_u t0, 0(a0) # e0 : the null was in the high-order bits addq a0, 8, a0 # .. e1 : - subq a2, 1, a2 # e1 : + subq a2, 1, a2 # e0 : $u_late_head_exit: - extql t2, a1, t0 # .. e0 : + extql t2, a1, t0 # e0 : cmpbge zero, t0, t7 # e0 : or t7, t10, t6 # e1 : cmoveq a2, t6, t7 # e0 : - nop # .. e1 : /* Take care of a final (probably partial) result word. On entry to this basic block: @@ -279,10 +281,22 @@ $u_final: 1: stq_u t0, 0(a0) # e0 : ret (t9) # .. e1 : -$u_eoc: # end-of-count - extqh t2, a1, t0 - or t0, t1, t0 - cmpbge zero, t0, t7 + /* Got to end-of-count before end of string. + On entry to this basic block: + t1 == the shifted high-order bits from the previous source word */ +$u_eoc: + and a1, 7, t6 # e1 : + sll t10, t6, t6 # e0 : + and t6, 0xff, t6 # e0 : + bne t6, 1f # e1 : avoid src word load if we can + + ldq_u t2, 8(a1) # e0 : load final src word + nop # .. e1 : + extqh t2, a1, t0 # e0 : extract high bits for last word + or t1, t0, t1 # e1 : + +1: cmpbge zero, t1, t7 + mov t1, t0 $u_eocfin: # end-of-count, final word or t10, t7, t7 |