diff options
author | Zack Weinberg <zackw@panix.com> | 2017-06-08 15:39:03 -0400 |
---|---|---|
committer | Zack Weinberg <zackw@panix.com> | 2017-06-08 15:39:03 -0400 |
commit | 5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch) | |
tree | 4470480d904b65cf14ca524f96f79eca818c3eaf /sysdeps/alpha/alphaev6 | |
parent | 199fc19d3aaaf57944ef036e15904febe877fc93 (diff) | |
download | glibc-zack/build-layout-experiment.tar glibc-zack/build-layout-experiment.tar.gz glibc-zack/build-layout-experiment.tar.bz2 glibc-zack/build-layout-experiment.zip |
Prepare for radical source tree reorganization. (branch: zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'sysdeps/alpha/alphaev6')
-rw-r--r-- | sysdeps/alpha/alphaev6/Implies | 1 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/addmul_1.S | 477 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/fpu/e_sqrt.S | 53 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/fpu/e_sqrtf.S | 53 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/memcpy.S | 255 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/memset.S | 223 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/stxcpy.S | 314 | ||||
-rw-r--r-- | sysdeps/alpha/alphaev6/stxncpy.S | 392 |
8 files changed, 0 insertions, 1768 deletions
diff --git a/sysdeps/alpha/alphaev6/Implies b/sysdeps/alpha/alphaev6/Implies deleted file mode 100644 index 0e7fc170ba..0000000000 --- a/sysdeps/alpha/alphaev6/Implies +++ /dev/null @@ -1 +0,0 @@ -alpha/alphaev5 diff --git a/sysdeps/alpha/alphaev6/addmul_1.S b/sysdeps/alpha/alphaev6/addmul_1.S deleted file mode 100644 index 1072ea763f..0000000000 --- a/sysdeps/alpha/alphaev6/addmul_1.S +++ /dev/null @@ -1,477 +0,0 @@ - # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add - # the result to a second limb vector. - # - # Copyright (C) 2000-2017 Free Software Foundation, Inc. - # - # This file is part of the GNU MP Library. - # - # The GNU MP Library is free software; you can redistribute it and/or modify - # it under the terms of the GNU Lesser General Public License as published - # by the Free Software Foundation; either version 2.1 of the License, or (at - # your option) any later version. - # - # The GNU MP Library is distributed in the hope that it will be useful, but - # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public - # License for more details. - # - # You should have received a copy of the GNU Lesser General Public License - # along with the GNU MP Library. If not, see <http://www.gnu.org/licenses/>. - - # INPUT PARAMETERS - # res_ptr $16 - # s1_ptr $17 - # size $18 - # s2_limb $19 - # - # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and - # exactly 3.625 cycles/limb on EV6... - # - # This code was written in close cooperation with ev6 pipeline expert - # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though. - # - # Register usages for unrolled loop: - # 0-3 mul's - # 4-7 acc's - # 8-15 mul results - # 20,21 carry's - # 22,23 save for stores - # - # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop. 
- # - # The stores can issue a cycle late so we have paired no-op's to 'catch' - # them, so that further disturbance to the schedule is damped. - # - # We couldn't pair the loads, because the entangled schedule of the - # carry's has to happen on one side {0} of the machine. Note, the total - # use of U0, and the total use of L0 (after attending to the stores). - # which is part of the reason why.... - # - # This is a great schedule for the d_cache, a poor schedule for the - # b_cache. The lockup on U0 means that any stall can't be recovered - # from. Consider a ldq in L1. say that load gets stalled because it - # collides with a fill from the b_Cache. On the next cycle, this load - # gets priority. If first looks at L0, and goes there. The instruction - # we intended for L0 gets to look at L1, which is NOT where we want - # it. It either stalls 1, because it can't go in L0, or goes there, and - # causes a further instruction to stall. - # - # So for b_cache, we're likely going to want to put one or more cycles - # back into the code! And, of course, put in prefetches. For the - # accumulator, lds, intent to modify. For the multiplier, you might - # want ldq, evict next, if you're not wanting to use it again soon. Use - # 256 ahead of present pointer value. At a place where we have an mt - # followed by a bookkeeping, put the bookkeeping in upper, and the - # prefetch into lower. - # - # Note, the usage of physical registers per cycle is smoothed off, as - # much as possible. - # - # Note, the ldq's and stq's are at the end of the quadpacks. note, we'd - # like not to have a ldq or stq to preceded a conditional branch in a - # quadpack. The conditional branch moves the retire pointer one cycle - # later. - # - # Optimization notes: - # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27? 
- # Reserved regs: $29 $30 $31 - # Free caller-saves regs in unrolled code: $24 $25 $28 - # We should swap some of the callee-saves regs for some of the free - # caller-saves regs, saving some overhead cycles. - # Most importantly, we should write fast code for the 0-7 case. - # The code we use there are for the 21164, and runs at 7 cycles/limb - # on the 21264. Should not be hard, if we write specialized code for - # 1-7 limbs (the one for 0 limbs should be straightforward). We then just - # need a jump table indexed by the low 3 bits of the count argument. - - .set noreorder - .set noat - .text - - .globl __mpn_addmul_1 - .ent __mpn_addmul_1 -__mpn_addmul_1: - .frame $30,0,$26,0 - .prologue 0 - - cmpult $18, 8, $1 - beq $1, $Large - - ldq $2, 0($17) # $2 = s1_limb - addq $17, 8, $17 # s1_ptr++ - subq $18, 1, $18 # size-- - mulq $2, $19, $3 # $3 = prod_low - ldq $5, 0($16) # $5 = *res_ptr - umulh $2, $19, $0 # $0 = prod_high - beq $18, $Lend0b # jump if size was == 1 - ldq $2, 0($17) # $2 = s1_limb - addq $17, 8, $17 # s1_ptr++ - subq $18, 1, $18 # size-- - addq $5, $3, $3 - cmpult $3, $5, $4 - stq $3, 0($16) - addq $16, 8, $16 # res_ptr++ - beq $18, $Lend0a # jump if size was == 2 - - .align 3 -$Loop0: mulq $2, $19, $3 # $3 = prod_low - ldq $5, 0($16) # $5 = *res_ptr - addq $4, $0, $0 # cy_limb = cy_limb + 'cy' - subq $18, 1, $18 # size-- - umulh $2, $19, $4 # $4 = cy_limb - ldq $2, 0($17) # $2 = s1_limb - addq $17, 8, $17 # s1_ptr++ - addq $3, $0, $3 # $3 = cy_limb + prod_low - cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) - addq $5, $3, $3 - cmpult $3, $5, $5 - stq $3, 0($16) - addq $16, 8, $16 # res_ptr++ - addq $5, $0, $0 # combine carries - bne $18, $Loop0 -$Lend0a: - mulq $2, $19, $3 # $3 = prod_low - ldq $5, 0($16) # $5 = *res_ptr - addq $4, $0, $0 # cy_limb = cy_limb + 'cy' - umulh $2, $19, $4 # $4 = cy_limb - addq $3, $0, $3 # $3 = cy_limb + prod_low - cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) - addq $5, $3, $3 - cmpult $3, $5, 
$5 - stq $3, 0($16) - addq $5, $0, $0 # combine carries - addq $4, $0, $0 # cy_limb = prod_high + cy - ret $31, ($26), 1 -$Lend0b: - addq $5, $3, $3 - cmpult $3, $5, $5 - stq $3, 0($16) - addq $0, $5, $0 - ret $31, ($26), 1 - -$Large: - lda $30, -240($30) - stq $9, 8($30) - stq $10, 16($30) - stq $11, 24($30) - stq $12, 32($30) - stq $13, 40($30) - stq $14, 48($30) - stq $15, 56($30) - - and $18, 7, $20 # count for the first loop, 0-7 - srl $18, 3, $18 # count for unrolled loop - bis $31, $31, $0 - beq $20, $Lunroll - ldq $2, 0($17) # $2 = s1_limb - addq $17, 8, $17 # s1_ptr++ - subq $20, 1, $20 # size-- - mulq $2, $19, $3 # $3 = prod_low - ldq $5, 0($16) # $5 = *res_ptr - umulh $2, $19, $0 # $0 = prod_high - beq $20, $Lend1b # jump if size was == 1 - ldq $2, 0($17) # $2 = s1_limb - addq $17, 8, $17 # s1_ptr++ - subq $20, 1, $20 # size-- - addq $5, $3, $3 - cmpult $3, $5, $4 - stq $3, 0($16) - addq $16, 8, $16 # res_ptr++ - beq $20, $Lend1a # jump if size was == 2 - - .align 3 -$Loop1: mulq $2, $19, $3 # $3 = prod_low - ldq $5, 0($16) # $5 = *res_ptr - addq $4, $0, $0 # cy_limb = cy_limb + 'cy' - subq $20, 1, $20 # size-- - umulh $2, $19, $4 # $4 = cy_limb - ldq $2, 0($17) # $2 = s1_limb - addq $17, 8, $17 # s1_ptr++ - addq $3, $0, $3 # $3 = cy_limb + prod_low - cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) - addq $5, $3, $3 - cmpult $3, $5, $5 - stq $3, 0($16) - addq $16, 8, $16 # res_ptr++ - addq $5, $0, $0 # combine carries - bne $20, $Loop1 - -$Lend1a: - mulq $2, $19, $3 # $3 = prod_low - ldq $5, 0($16) # $5 = *res_ptr - addq $4, $0, $0 # cy_limb = cy_limb + 'cy' - umulh $2, $19, $4 # $4 = cy_limb - addq $3, $0, $3 # $3 = cy_limb + prod_low - cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low) - addq $5, $3, $3 - cmpult $3, $5, $5 - stq $3, 0($16) - addq $16, 8, $16 # res_ptr++ - addq $5, $0, $0 # combine carries - addq $4, $0, $0 # cy_limb = prod_high + cy - br $31, $Lunroll -$Lend1b: - addq $5, $3, $3 - cmpult $3, $5, $5 - stq $3, 0($16) - 
addq $16, 8, $16 # res_ptr++ - addq $0, $5, $0 - -$Lunroll: - lda $17, -16($17) # L1 bookkeeping - lda $16, -16($16) # L1 bookkeeping - bis $0, $31, $12 - - # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____ - - ldq $2, 16($17) # L1 - ldq $3, 24($17) # L1 - lda $18, -1($18) # L1 bookkeeping - ldq $6, 16($16) # L1 - ldq $7, 24($16) # L1 - ldq $0, 32($17) # L1 - mulq $19, $2, $13 # U1 - ldq $1, 40($17) # L1 - umulh $19, $2, $14 # U1 - mulq $19, $3, $15 # U1 - lda $17, 64($17) # L1 bookkeeping - ldq $4, 32($16) # L1 - ldq $5, 40($16) # L1 - umulh $19, $3, $8 # U1 - ldq $2, -16($17) # L1 - mulq $19, $0, $9 # U1 - ldq $3, -8($17) # L1 - umulh $19, $0, $10 # U1 - addq $6, $13, $6 # L0 lo + acc - mulq $19, $1, $11 # U1 - cmpult $6, $13, $20 # L0 lo add => carry - lda $16, 64($16) # L1 bookkeeping - addq $6, $12, $22 # U0 hi add => answer - cmpult $22, $12, $21 # L0 hi add => carry - addq $14, $20, $14 # U0 hi mul + carry - ldq $6, -16($16) # L1 - addq $7, $15, $23 # L0 lo + acc - addq $14, $21, $14 # U0 hi mul + carry - ldq $7, -8($16) # L1 - umulh $19, $1, $12 # U1 - cmpult $23, $15, $20 # L0 lo add => carry - addq $23, $14, $23 # U0 hi add => answer - ldq $0, 0($17) # L1 - mulq $19, $2, $13 # U1 - cmpult $23, $14, $21 # L0 hi add => carry - addq $8, $20, $8 # U0 hi mul + carry - ldq $1, 8($17) # L1 - umulh $19, $2, $14 # U1 - addq $4, $9, $4 # L0 lo + acc - stq $22, -48($16) # L0 - stq $23, -40($16) # L1 - mulq $19, $3, $15 # U1 - addq $8, $21, $8 # U0 hi mul + carry - cmpult $4, $9, $20 # L0 lo add => carry - addq $4, $8, $22 # U0 hi add => answer - ble $18, $Lend # U1 bookkeeping - - # ____ MAIN UNROLLED LOOP ____ - .align 4 -$Loop: - bis $31, $31, $31 # U1 mt - cmpult $22, $8, $21 # L0 hi add => carry - addq $10, $20, $10 # U0 hi mul + carry - ldq $4, 0($16) # L1 - - bis $31, $31, $31 # U1 mt - addq $5, $11, $23 # L0 lo + acc - addq $10, $21, $10 # L0 hi mul + carry - ldq $5, 8($16) # L1 - - umulh $19, $3, $8 # U1 - cmpult $23, $11, $20 # L0 lo add => carry - addq 
$23, $10, $23 # U0 hi add => answer - ldq $2, 16($17) # L1 - - mulq $19, $0, $9 # U1 - cmpult $23, $10, $21 # L0 hi add => carry - addq $12, $20, $12 # U0 hi mul + carry - ldq $3, 24($17) # L1 - - umulh $19, $0, $10 # U1 - addq $6, $13, $6 # L0 lo + acc - stq $22, -32($16) # L0 - stq $23, -24($16) # L1 - - bis $31, $31, $31 # L0 st slosh - mulq $19, $1, $11 # U1 - bis $31, $31, $31 # L1 st slosh - addq $12, $21, $12 # U0 hi mul + carry - - cmpult $6, $13, $20 # L0 lo add => carry - bis $31, $31, $31 # U1 mt - lda $18, -1($18) # L1 bookkeeping - addq $6, $12, $22 # U0 hi add => answer - - bis $31, $31, $31 # U1 mt - cmpult $22, $12, $21 # L0 hi add => carry - addq $14, $20, $14 # U0 hi mul + carry - ldq $6, 16($16) # L1 - - bis $31, $31, $31 # U1 mt - addq $7, $15, $23 # L0 lo + acc - addq $14, $21, $14 # U0 hi mul + carry - ldq $7, 24($16) # L1 - - umulh $19, $1, $12 # U1 - cmpult $23, $15, $20 # L0 lo add => carry - addq $23, $14, $23 # U0 hi add => answer - ldq $0, 32($17) # L1 - - mulq $19, $2, $13 # U1 - cmpult $23, $14, $21 # L0 hi add => carry - addq $8, $20, $8 # U0 hi mul + carry - ldq $1, 40($17) # L1 - - umulh $19, $2, $14 # U1 - addq $4, $9, $4 # U0 lo + acc - stq $22, -16($16) # L0 - stq $23, -8($16) # L1 - - bis $31, $31, $31 # L0 st slosh - mulq $19, $3, $15 # U1 - bis $31, $31, $31 # L1 st slosh - addq $8, $21, $8 # L0 hi mul + carry - - cmpult $4, $9, $20 # L0 lo add => carry - bis $31, $31, $31 # U1 mt - lda $17, 64($17) # L1 bookkeeping - addq $4, $8, $22 # U0 hi add => answer - - bis $31, $31, $31 # U1 mt - cmpult $22, $8, $21 # L0 hi add => carry - addq $10, $20, $10 # U0 hi mul + carry - ldq $4, 32($16) # L1 - - bis $31, $31, $31 # U1 mt - addq $5, $11, $23 # L0 lo + acc - addq $10, $21, $10 # L0 hi mul + carry - ldq $5, 40($16) # L1 - - umulh $19, $3, $8 # U1 - cmpult $23, $11, $20 # L0 lo add => carry - addq $23, $10, $23 # U0 hi add => answer - ldq $2, -16($17) # L1 - - mulq $19, $0, $9 # U1 - cmpult $23, $10, $21 # L0 hi add => carry - addq 
$12, $20, $12 # U0 hi mul + carry - ldq $3, -8($17) # L1 - - umulh $19, $0, $10 # U1 - addq $6, $13, $6 # L0 lo + acc - stq $22, 0($16) # L0 - stq $23, 8($16) # L1 - - bis $31, $31, $31 # L0 st slosh - mulq $19, $1, $11 # U1 - bis $31, $31, $31 # L1 st slosh - addq $12, $21, $12 # U0 hi mul + carry - - cmpult $6, $13, $20 # L0 lo add => carry - bis $31, $31, $31 # U1 mt - lda $16, 64($16) # L1 bookkeeping - addq $6, $12, $22 # U0 hi add => answer - - bis $31, $31, $31 # U1 mt - cmpult $22, $12, $21 # L0 hi add => carry - addq $14, $20, $14 # U0 hi mul + carry - ldq $6, -16($16) # L1 - - bis $31, $31, $31 # U1 mt - addq $7, $15, $23 # L0 lo + acc - addq $14, $21, $14 # U0 hi mul + carry - ldq $7, -8($16) # L1 - - umulh $19, $1, $12 # U1 - cmpult $23, $15, $20 # L0 lo add => carry - addq $23, $14, $23 # U0 hi add => answer - ldq $0, 0($17) # L1 - - mulq $19, $2, $13 # U1 - cmpult $23, $14, $21 # L0 hi add => carry - addq $8, $20, $8 # U0 hi mul + carry - ldq $1, 8($17) # L1 - - umulh $19, $2, $14 # U1 - addq $4, $9, $4 # L0 lo + acc - stq $22, -48($16) # L0 - stq $23, -40($16) # L1 - - bis $31, $31, $31 # L0 st slosh - mulq $19, $3, $15 # U1 - bis $31, $31, $31 # L1 st slosh - addq $8, $21, $8 # U0 hi mul + carry - - cmpult $4, $9, $20 # L0 lo add => carry - addq $4, $8, $22 # U0 hi add => answer - bis $31, $31, $31 # L1 mt - bgt $18, $Loop # U1 bookkeeping - -# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____ -$Lend: - cmpult $22, $8, $21 # L0 hi add => carry - addq $10, $20, $10 # U0 hi mul + carry - ldq $4, 0($16) # L1 - addq $5, $11, $23 # L0 lo + acc - addq $10, $21, $10 # L0 hi mul + carry - ldq $5, 8($16) # L1 - umulh $19, $3, $8 # U1 - cmpult $23, $11, $20 # L0 lo add => carry - addq $23, $10, $23 # U0 hi add => answer - mulq $19, $0, $9 # U1 - cmpult $23, $10, $21 # L0 hi add => carry - addq $12, $20, $12 # U0 hi mul + carry - umulh $19, $0, $10 # U1 - addq $6, $13, $6 # L0 lo + acc - stq $22, -32($16) # L0 - stq $23, -24($16) # L1 - mulq $19, $1, $11 # U1 - 
addq $12, $21, $12 # U0 hi mul + carry - cmpult $6, $13, $20 # L0 lo add => carry - addq $6, $12, $22 # U0 hi add => answer - cmpult $22, $12, $21 # L0 hi add => carry - addq $14, $20, $14 # U0 hi mul + carry - addq $7, $15, $23 # L0 lo + acc - addq $14, $21, $14 # U0 hi mul + carry - umulh $19, $1, $12 # U1 - cmpult $23, $15, $20 # L0 lo add => carry - addq $23, $14, $23 # U0 hi add => answer - cmpult $23, $14, $21 # L0 hi add => carry - addq $8, $20, $8 # U0 hi mul + carry - addq $4, $9, $4 # U0 lo + acc - stq $22, -16($16) # L0 - stq $23, -8($16) # L1 - bis $31, $31, $31 # L0 st slosh - addq $8, $21, $8 # L0 hi mul + carry - cmpult $4, $9, $20 # L0 lo add => carry - addq $4, $8, $22 # U0 hi add => answer - cmpult $22, $8, $21 # L0 hi add => carry - addq $10, $20, $10 # U0 hi mul + carry - addq $5, $11, $23 # L0 lo + acc - addq $10, $21, $10 # L0 hi mul + carry - cmpult $23, $11, $20 # L0 lo add => carry - addq $23, $10, $23 # U0 hi add => answer - cmpult $23, $10, $21 # L0 hi add => carry - addq $12, $20, $12 # U0 hi mul + carry - stq $22, 0($16) # L0 - stq $23, 8($16) # L1 - addq $12, $21, $0 # U0 hi mul + carry - - ldq $9, 8($30) - ldq $10, 16($30) - ldq $11, 24($30) - ldq $12, 32($30) - ldq $13, 40($30) - ldq $14, 48($30) - ldq $15, 56($30) - lda $30, 240($30) - ret $31, ($26), 1 - - .end __mpn_addmul_1 diff --git a/sysdeps/alpha/alphaev6/fpu/e_sqrt.S b/sysdeps/alpha/alphaev6/fpu/e_sqrt.S deleted file mode 100644 index 18d03ee9c9..0000000000 --- a/sysdeps/alpha/alphaev6/fpu/e_sqrt.S +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (C) 2000-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <shlib-compat.h> - - .arch ev6 - .set noreorder - .set noat - -ENTRY(__ieee754_sqrt) -#ifdef PROF - ldgp gp, 0(pv) - lda AT, _mcount - jsr AT, (AT), _mcount - .prologue 1 -#else - .prologue 0 -#endif - - .align 4 -#ifdef _IEEE_FP_INEXACT - sqrtt/suid $f16, $f0 -#else - sqrtt/sud $f16, $f0 -#endif - ret - nop - nop - -END(__ieee754_sqrt) - -#if SHLIB_COMPAT (libm, GLIBC_2_15, GLIBC_2_18) -strong_alias(__ieee754_sqrt, __sqrt_finite1) -compat_symbol(libm, __sqrt_finite1, __sqrt_finite, GLIBC_2_15) -versioned_symbol(libm, __ieee754_sqrt, __sqrt_finite, GLIBC_2_18) -#else -strong_alias(__ieee754_sqrt, __sqrt_finite) -#endif diff --git a/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S b/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S deleted file mode 100644 index c4ef9c32c6..0000000000 --- a/sysdeps/alpha/alphaev6/fpu/e_sqrtf.S +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (C) 2000-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <shlib-compat.h> - - .arch ev6 - .set noreorder - .set noat - -ENTRY(__ieee754_sqrtf) -#ifdef PROF - ldgp gp, 0(pv) - lda AT, _mcount - jsr AT, (AT), _mcount - .prologue 1 -#else - .prologue 0 -#endif - - .align 4 -#ifdef _IEEE_FP_INEXACT - sqrts/suid $f16, $f0 -#else - sqrts/sud $f16, $f0 -#endif - ret - nop - nop - -END(__ieee754_sqrtf) - -#if SHLIB_COMPAT (libm, GLIBC_2_15, GLIBC_2_18) -strong_alias(__ieee754_sqrtf, __sqrtf_finite1) -compat_symbol(libm, __sqrtf_finite1, __sqrtf_finite, GLIBC_2_15) -versioned_symbol(libm, __ieee754_sqrtf, __sqrtf_finite, GLIBC_2_18) -#else -strong_alias(__ieee754_sqrtf, __sqrtf_finite) -#endif diff --git a/sysdeps/alpha/alphaev6/memcpy.S b/sysdeps/alpha/alphaev6/memcpy.S deleted file mode 100644 index 170a23b5da..0000000000 --- a/sysdeps/alpha/alphaev6/memcpy.S +++ /dev/null @@ -1,255 +0,0 @@ -/* Copyright (C) 2000-2017 Free Software Foundation, Inc. - This file is part of the GNU C Library. - EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. 
*/ - -/* - * Much of the information about 21264 scheduling/coding comes from: - * Compiler Writer's Guide for the Alpha 21264 - * abbreviated as 'CWG' in other comments here - * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html - * Scheduling notation: - * E - either cluster - * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 - * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 - * - * Temp usage notes: - * $0 - destination address - * $1,$2, - scratch - */ - -#include <sysdep.h> - - .arch ev6 - .set noreorder - .set noat - -ENTRY(memcpy) - .prologue 0 - - mov $16, $0 # E : copy dest to return - ble $18, $nomoredata # U : done with the copy? - xor $16, $17, $1 # E : are source and dest alignments the same? - and $1, 7, $1 # E : are they the same mod 8? - - bne $1, $misaligned # U : Nope - gotta do this the slow way - /* source and dest are same mod 8 address */ - and $16, 7, $1 # E : Are both 0mod8? - beq $1, $both_0mod8 # U : Yes - nop # E : - - /* - * source and dest are same misalignment. move a byte at a time - * until a 0mod8 alignment for both is reached. - * At least one byte more to move - */ - -$head_align: - ldbu $1, 0($17) # L : grab a byte - subq $18, 1, $18 # E : count-- - addq $17, 1, $17 # E : src++ - stb $1, 0($16) # L : - addq $16, 1, $16 # E : dest++ - and $16, 7, $1 # E : Are we at 0mod8 yet? - ble $18, $nomoredata # U : done with the copy? - bne $1, $head_align # U : - -$both_0mod8: - cmple $18, 127, $1 # E : Can we unroll the loop? 
- bne $1, $no_unroll # U : - and $16, 63, $1 # E : get mod64 alignment - beq $1, $do_unroll # U : no single quads to fiddle - -$single_head_quad: - ldq $1, 0($17) # L : get 8 bytes - subq $18, 8, $18 # E : count -= 8 - addq $17, 8, $17 # E : src += 8 - nop # E : - - stq $1, 0($16) # L : store - addq $16, 8, $16 # E : dest += 8 - and $16, 63, $1 # E : get mod64 alignment - bne $1, $single_head_quad # U : still not fully aligned - -$do_unroll: - addq $16, 64, $7 # E : Initial (+1 trip) wh64 address - cmple $18, 127, $1 # E : Can we go through the unrolled loop? - bne $1, $tail_quads # U : Nope - nop # E : - -$unroll_body: - wh64 ($7) # L1 : memory subsystem hint: 64 bytes at - # ($7) are about to be over-written - ldq $6, 0($17) # L0 : bytes 0..7 - nop # E : - nop # E : - - ldq $4, 8($17) # L : bytes 8..15 - ldq $5, 16($17) # L : bytes 16..23 - addq $7, 64, $7 # E : Update next wh64 address - nop # E : - - ldq $3, 24($17) # L : bytes 24..31 - addq $16, 64, $1 # E : fallback value for wh64 - nop # E : - nop # E : - - addq $17, 32, $17 # E : src += 32 bytes - stq $6, 0($16) # L : bytes 0..7 - nop # E : - nop # E : - - stq $4, 8($16) # L : bytes 8..15 - stq $5, 16($16) # L : bytes 16..23 - subq $18, 192, $2 # E : At least two more trips to go? - nop # E : - - stq $3, 24($16) # L : bytes 24..31 - addq $16, 32, $16 # E : dest += 32 bytes - nop # E : - nop # E : - - ldq $6, 0($17) # L : bytes 0..7 - ldq $4, 8($17) # L : bytes 8..15 - cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use - # fallback wh64 address if < 2 more trips - nop # E : - - ldq $5, 16($17) # L : bytes 16..23 - ldq $3, 24($17) # L : bytes 24..31 - addq $16, 32, $16 # E : dest += 32 - subq $18, 64, $18 # E : count -= 64 - - addq $17, 32, $17 # E : src += 32 - stq $6, -32($16) # L : bytes 0..7 - stq $4, -24($16) # L : bytes 8..15 - cmple $18, 63, $1 # E : At least one more trip? 
- - stq $5, -16($16) # L : bytes 16..23 - stq $3, -8($16) # L : bytes 24..31 - nop # E : - beq $1, $unroll_body - -$tail_quads: -$no_unroll: - .align 4 - subq $18, 8, $18 # E : At least a quad left? - blt $18, $less_than_8 # U : Nope - nop # E : - nop # E : - -$move_a_quad: - ldq $1, 0($17) # L : fetch 8 - subq $18, 8, $18 # E : count -= 8 - addq $17, 8, $17 # E : src += 8 - nop # E : - - stq $1, 0($16) # L : store 8 - addq $16, 8, $16 # E : dest += 8 - bge $18, $move_a_quad # U : - nop # E : - -$less_than_8: - .align 4 - addq $18, 8, $18 # E : add back for trailing bytes - ble $18, $nomoredata # U : All-done - nop # E : - nop # E : - - /* Trailing bytes */ -$tail_bytes: - subq $18, 1, $18 # E : count-- - ldbu $1, 0($17) # L : fetch a byte - addq $17, 1, $17 # E : src++ - nop # E : - - stb $1, 0($16) # L : store a byte - addq $16, 1, $16 # E : dest++ - bgt $18, $tail_bytes # U : more to be done? - nop # E : - - /* branching to exit takes 3 extra cycles, so replicate exit here */ - ret $31, ($26), 1 # L0 : - nop # E : - nop # E : - nop # E : - -$misaligned: - mov $0, $4 # E : dest temp - and $0, 7, $1 # E : dest alignment mod8 - beq $1, $dest_0mod8 # U : life doesnt totally suck - nop - -$aligndest: - ble $18, $nomoredata # U : - ldbu $1, 0($17) # L : fetch a byte - subq $18, 1, $18 # E : count-- - addq $17, 1, $17 # E : src++ - - stb $1, 0($4) # L : store it - addq $4, 1, $4 # E : dest++ - and $4, 7, $1 # E : dest 0mod8 yet? - bne $1, $aligndest # U : go until we are aligned. - - /* Source has unknown alignment, but dest is known to be 0mod8 */ -$dest_0mod8: - subq $18, 8, $18 # E : At least a quad left? 
- blt $18, $misalign_tail # U : Nope - ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes - nop # E : - -$mis_quad: - ldq_u $16, 8($17) # L : Fetch next 8 - extql $3, $17, $3 # U : masking - extqh $16, $17, $1 # U : masking - bis $3, $1, $1 # E : merged bytes to store - - subq $18, 8, $18 # E : count -= 8 - addq $17, 8, $17 # E : src += 8 - stq $1, 0($4) # L : store 8 (aligned) - mov $16, $3 # E : "rotate" source data - - addq $4, 8, $4 # E : dest += 8 - bge $18, $mis_quad # U : More quads to move - nop - nop - -$misalign_tail: - addq $18, 8, $18 # E : account for tail stuff - ble $18, $nomoredata # U : - nop - nop - -$misalign_byte: - ldbu $1, 0($17) # L : fetch 1 - subq $18, 1, $18 # E : count-- - addq $17, 1, $17 # E : src++ - nop # E : - - stb $1, 0($4) # L : store - addq $4, 1, $4 # E : dest++ - bgt $18, $misalign_byte # U : more to go? - nop - - -$nomoredata: - ret $31, ($26), 1 # L0 : - nop # E : - nop # E : - nop # E : - -END(memcpy) -libc_hidden_builtin_def (memcpy) diff --git a/sysdeps/alpha/alphaev6/memset.S b/sysdeps/alpha/alphaev6/memset.S deleted file mode 100644 index 185821c7eb..0000000000 --- a/sysdeps/alpha/alphaev6/memset.S +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (C) 2000-2017 Free Software Foundation, Inc. - Contributed by Richard Henderson (rth@tamu.edu) - EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - - .arch ev6 - .set noat - .set noreorder - -ENTRY(memset) -#ifdef PROF - ldgp gp, 0(pv) - lda AT, _mcount - jsr AT, (AT), _mcount - .prologue 1 -#else - .prologue 0 -#endif - - /* - * Serious stalling happens. The only way to mitigate this is to - * undertake a major re-write to interleave the constant materialization - * with other parts of the fall-through code. This is important, even - * though it makes maintenance tougher. - * Do this later. - */ - and $17, 255, $1 # E : 00000000000000ch - insbl $17, 1, $2 # U : 000000000000ch00 - mov $16, $0 # E : return value - ble $18, $end # U : zero length requested? - - addq $18, $16, $6 # E : max address to write to - or $1, $2, $17 # E : 000000000000chch - insbl $1, 2, $3 # U : 0000000000ch0000 - insbl $1, 3, $4 # U : 00000000ch000000 - - or $3, $4, $3 # E : 00000000chch0000 - inswl $17, 4, $5 # U : 0000chch00000000 - xor $16, $6, $1 # E : will complete write be within one quadword? - inswl $17, 6, $2 # U : chch000000000000 - - or $17, $3, $17 # E : 00000000chchchch - or $2, $5, $2 # E : chchchch00000000 - bic $1, 7, $1 # E : fit within a single quadword? - and $16, 7, $3 # E : Target addr misalignment - - or $17, $2, $17 # E : chchchchchchchch - beq $1, $within_quad # U : - nop # E : - beq $3, $aligned # U : target is 0mod8 - - /* - * Target address is misaligned, and won't fit within a quadword. 
- */ - ldq_u $4, 0($16) # L : Fetch first partial - mov $16, $5 # E : Save the address - insql $17, $16, $2 # U : Insert new bytes - subq $3, 8, $3 # E : Invert (for addressing uses) - - addq $18, $3, $18 # E : $18 is new count ($3 is negative) - mskql $4, $16, $4 # U : clear relevant parts of the quad - subq $16, $3, $16 # E : $16 is new aligned destination - or $2, $4, $1 # E : Final bytes - - nop - stq_u $1,0($5) # L : Store result - nop - nop - - .align 4 -$aligned: - /* - * We are now guaranteed to be quad aligned, with at least - * one partial quad to write. - */ - - sra $18, 3, $3 # U : Number of remaining quads to write - and $18, 7, $18 # E : Number of trailing bytes to write - mov $16, $5 # E : Save dest address - beq $3, $no_quad # U : tail stuff only - - /* - * It's worth the effort to unroll this and use wh64 if possible. - * At this point, entry values are: - * $16 Current destination address - * $5 A copy of $16 - * $6 The max quadword address to write to - * $18 Number trailer bytes - * $3 Number quads to write - */ - - and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) - subq $3, 16, $4 # E : Only try to unroll if > 128 bytes - subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) - blt $4, $loop # U : - - /* - * We know we've got at least 16 quads, minimum of one trip - * through unrolled loop. Do a quad at a time to get us 0mod64 - * aligned. 
- */ - - nop # E : - nop # E : - nop # E : - beq $1, $bigalign # U : - -$alignmod64: - stq $17, 0($5) # L : - subq $3, 1, $3 # E : For consistency later - addq $1, 8, $1 # E : Increment towards zero for alignment - addq $5, 8, $4 # E : Initial wh64 address (filler instruction) - - nop - nop - addq $5, 8, $5 # E : Inc address - blt $1, $alignmod64 # U : - -$bigalign: - /* - * $3 - number quads left to go - * $5 - target address (aligned 0mod64) - * $17 - mask of stuff to store - * Scratch registers available: $7, $2, $4, $1 - * We know that we'll be taking a minimum of one trip through. - * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle - * Assumes the wh64 needs to be for 2 trips through the loop in the future. - * The wh64 is issued on for the starting destination address for trip +2 - * through the loop, and if there are less than two trips left, the target - * address will be for the current trip. - */ - -$do_wh64: - wh64 ($4) # L1 : memory subsystem write hint - subq $3, 24, $2 # E : For determining future wh64 addresses - stq $17, 0($5) # L : - nop # E : - - addq $5, 128, $4 # E : speculative target of next wh64 - stq $17, 8($5) # L : - stq $17, 16($5) # L : - addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) - - stq $17, 24($5) # L : - stq $17, 32($5) # L : - cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle - nop - - stq $17, 40($5) # L : - stq $17, 48($5) # L : - subq $3, 16, $2 # E : Repeat the loop at least once more? - nop - - stq $17, 56($5) # L : - addq $5, 64, $5 # E : - subq $3, 8, $3 # E : - bge $2, $do_wh64 # U : - - nop - nop - nop - beq $3, $no_quad # U : Might have finished already - - .align 4 - /* - * Simple loop for trailing quadwords, or for small amounts - * of data (where we can't use an unrolled loop and wh64) - */ -$loop: - stq $17, 0($5) # L : - subq $3, 1, $3 # E : Decrement number quads left - addq $5, 8, $5 # E : Inc address - bne $3, $loop # U : more? 
- -$no_quad: - /* - * Write 0..7 trailing bytes. - */ - nop # E : - beq $18, $end # U : All done? - ldq $7, 0($5) # L : - mskqh $7, $6, $2 # U : Mask final quad - - insqh $17, $6, $4 # U : New bits - or $2, $4, $1 # E : Put it all together - stq $1, 0($5) # L : And back to memory - ret $31,($26),1 # L0 : - -$within_quad: - ldq_u $1, 0($16) # L : - insql $17, $16, $2 # U : New bits - mskql $1, $16, $4 # U : Clear old - or $2, $4, $2 # E : New result - - mskql $2, $6, $4 # U : - mskqh $1, $6, $2 # U : - or $2, $4, $1 # E : - stq_u $1, 0($16) # L : - -$end: - nop - nop - nop - ret $31,($26),1 # L0 : - - END(memset) -libc_hidden_builtin_def (memset) diff --git a/sysdeps/alpha/alphaev6/stxcpy.S b/sysdeps/alpha/alphaev6/stxcpy.S deleted file mode 100644 index 84f19581d1..0000000000 --- a/sysdeps/alpha/alphaev6/stxcpy.S +++ /dev/null @@ -1,314 +0,0 @@ -/* Copyright (C) 2000-2017 Free Software Foundation, Inc. - Contributed by Richard Henderson (rth@tamu.edu) - EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -/* Copy a null-terminated string from SRC to DST. - - This is an internal routine used by strcpy, stpcpy, and strcat. - As such, it uses special linkage conventions to make implementation - of these public functions more efficient. 
- - On input: - t9 = return address - a0 = DST - a1 = SRC - - On output: - t8 = bitmask (with one bit set) indicating the last byte written - a0 = unaligned address of the last *word* written - - Furthermore, v0, a3-a5, t11, and t12 are untouched. -*/ - - -#include <sysdep.h> - - .arch ev6 - .set noat - .set noreorder - - .text - .type __stxcpy, @function - .globl __stxcpy - .usepv __stxcpy, no - - cfi_startproc - cfi_return_column (t9) - - /* On entry to this basic block: - t0 == the first destination word for masking back in - t1 == the first source word. */ - .align 4 -stxcpy_aligned: - /* Create the 1st output word and detect 0's in the 1st input word. */ - lda t2, -1 # E : build a mask against false zero - mskqh t2, a1, t2 # U : detection in the src word (stall) - mskqh t1, a1, t3 # U : - ornot t1, t2, t2 # E : (stall) - - mskql t0, a1, t0 # U : assemble the first output word - cmpbge zero, t2, t10 # E : bits set iff null found - or t0, t3, t1 # E : (stall) - bne t10, $a_eos # U : (stall) - - /* On entry to this basic block: - t0 == the first destination word for masking back in - t1 == a source word not containing a null. */ - /* Nops here to separate store quads from load quads */ - -$a_loop: - stq_u t1, 0(a0) # L : - addq a0, 8, a0 # E : - nop - nop - - ldq_u t1, 0(a1) # L : Latency=3 - addq a1, 8, a1 # E : - cmpbge zero, t1, t10 # E : (3 cycle stall) - beq t10, $a_loop # U : (stall for t10) - - /* Take care of the final (partial) word store. - On entry to this basic block we have: - t1 == the source word containing the null - t10 == the cmpbge mask that found it. */ -$a_eos: - negq t10, t6 # E : find low bit set - and t10, t6, t8 # E : (stall) - /* For the sake of the cache, don't read a destination word - if we're not going to need it. */ - and t8, 0x80, t6 # E : (stall) - bne t6, 1f # U : (stall) - - /* We're doing a partial word store and so need to combine - our source and original destination words. 
*/ - ldq_u t0, 0(a0) # L : Latency=3 - subq t8, 1, t6 # E : - zapnot t1, t6, t1 # U : clear src bytes >= null (stall) - or t8, t6, t10 # E : (stall) - - zap t0, t10, t0 # E : clear dst bytes <= null - or t0, t1, t1 # E : (stall) - nop - nop - -1: stq_u t1, 0(a0) # L : - ret (t9) # L0 : Latency=3 - nop - nop - - .align 4 -__stxcpy: - /* Are source and destination co-aligned? */ - xor a0, a1, t0 # E : - unop # E : - and t0, 7, t0 # E : (stall) - bne t0, $unaligned # U : (stall) - - /* We are co-aligned; take care of a partial first word. */ - ldq_u t1, 0(a1) # L : load first src word - and a0, 7, t0 # E : take care not to load a word ... - addq a1, 8, a1 # E : - beq t0, stxcpy_aligned # U : ... if we wont need it (stall) - - ldq_u t0, 0(a0) # L : - br stxcpy_aligned # L0 : Latency=3 - nop - nop - - -/* The source and destination are not co-aligned. Align the destination - and cope. We have to be very careful about not reading too much and - causing a SEGV. */ - - .align 4 -$u_head: - /* We know just enough now to be able to assemble the first - full source word. We can still find a zero at the end of it - that prevents us from outputting the whole thing. 
- - On entry to this basic block: - t0 == the first dest word, for masking back in, if needed else 0 - t1 == the low bits of the first source word - t6 == bytemask that is -1 in dest word bytes */ - - ldq_u t2, 8(a1) # L : - addq a1, 8, a1 # E : - extql t1, a1, t1 # U : (stall on a1) - extqh t2, a1, t4 # U : (stall on a1) - - mskql t0, a0, t0 # U : - or t1, t4, t1 # E : - mskqh t1, a0, t1 # U : (stall on t1) - or t0, t1, t1 # E : (stall on t1) - - or t1, t6, t6 # E : - cmpbge zero, t6, t10 # E : (stall) - lda t6, -1 # E : for masking just below - bne t10, $u_final # U : (stall) - - mskql t6, a1, t6 # U : mask out the bits we have - or t6, t2, t2 # E : already extracted before (stall) - cmpbge zero, t2, t10 # E : testing eos (stall) - bne t10, $u_late_head_exit # U : (stall) - - /* Finally, we've got all the stupid leading edge cases taken care - of and we can set up to enter the main loop. */ - - stq_u t1, 0(a0) # L : store first output word - addq a0, 8, a0 # E : - extql t2, a1, t0 # U : position ho-bits of lo word - ldq_u t2, 8(a1) # U : read next high-order source word - - addq a1, 8, a1 # E : - cmpbge zero, t2, t10 # E : (stall for t2) - nop # E : - bne t10, $u_eos # U : (stall) - - /* Unaligned copy main loop. In order to avoid reading too much, - the loop is structured to detect zeros in aligned source words. - This has, unfortunately, effectively pulled half of a loop - iteration out into the head and half into the tail, but it does - prevent nastiness from accumulating in the very thing we want - to run as fast as possible. - - On entry to this basic block: - t0 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word - - We further know that t2 does not contain a null terminator. 
*/ - - .align 3 -$u_loop: - extqh t2, a1, t1 # U : extract high bits for current word - addq a1, 8, a1 # E : (stall) - extql t2, a1, t3 # U : extract low bits for next time (stall) - addq a0, 8, a0 # E : - - or t0, t1, t1 # E : current dst word now complete - ldq_u t2, 0(a1) # L : Latency=3 load high word for next time - stq_u t1, -8(a0) # L : save the current word (stall) - mov t3, t0 # E : - - cmpbge zero, t2, t10 # E : test new word for eos - beq t10, $u_loop # U : (stall) - nop - nop - - /* We've found a zero somewhere in the source word we just read. - If it resides in the lower half, we have one (probably partial) - word to write out, and if it resides in the upper half, we - have one full and one partial word left to write out. - - On entry to this basic block: - t0 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word. */ -$u_eos: - extqh t2, a1, t1 # U : - or t0, t1, t1 # E : first (partial) source word complete (stall) - cmpbge zero, t1, t10 # E : is the null in this first bit? (stall) - bne t10, $u_final # U : (stall) - -$u_late_head_exit: - stq_u t1, 0(a0) # L : the null was in the high-order bits - addq a0, 8, a0 # E : - extql t2, a1, t1 # U : - cmpbge zero, t1, t10 # E : (stall) - - /* Take care of a final (probably partial) result word. - On entry to this basic block: - t1 == assembled source word - t10 == cmpbge mask that found the null. */ -$u_final: - negq t10, t6 # E : isolate low bit set - and t6, t10, t8 # E : (stall) - and t8, 0x80, t6 # E : avoid dest word load if we can (stall) - bne t6, 1f # U : (stall) - - ldq_u t0, 0(a0) # E : - subq t8, 1, t6 # E : - or t6, t8, t10 # E : (stall) - zapnot t1, t6, t1 # U : kill source bytes >= null (stall) - - zap t0, t10, t0 # U : kill dest bytes <= null (2 cycle data stall) - or t0, t1, t1 # E : (stall) - nop - nop - -1: stq_u t1, 0(a0) # L : - ret (t9) # L0 : Latency=3 - nop - nop - - /* Unaligned copy entry point. 
*/ - .align 4 -$unaligned: - - ldq_u t1, 0(a1) # L : load first source word - and a0, 7, t4 # E : find dest misalignment - and a1, 7, t5 # E : find src misalignment - /* Conditionally load the first destination word and a bytemask - with 0xff indicating that the destination byte is sacrosanct. */ - mov zero, t0 # E : - - mov zero, t6 # E : - beq t4, 1f # U : - ldq_u t0, 0(a0) # L : - lda t6, -1 # E : - - mskql t6, a0, t6 # U : - nop - nop - nop -1: - subq a1, t4, a1 # E : sub dest misalignment from src addr - /* If source misalignment is larger than dest misalignment, we need - extra startup checks to avoid SEGV. */ - cmplt t4, t5, t8 # E : - beq t8, $u_head # U : - lda t2, -1 # E : mask out leading garbage in source - - mskqh t2, t5, t2 # U : - ornot t1, t2, t3 # E : (stall) - cmpbge zero, t3, t10 # E : is there a zero? (stall) - beq t10, $u_head # U : (stall) - - /* At this point we've found a zero in the first partial word of - the source. We need to isolate the valid source data and mask - it into the original destination data. (Incidentally, we know - that we'll need at least one byte of that original dest word.) */ - - ldq_u t0, 0(a0) # L : - negq t10, t6 # E : build bitmask of bytes <= zero - and t6, t10, t8 # E : (stall) - and a1, 7, t5 # E : - - subq t8, 1, t6 # E : - or t6, t8, t10 # E : (stall) - srl t8, t5, t8 # U : adjust final null return value - zapnot t2, t10, t2 # U : prepare source word; mirror changes (stall) - - and t1, t2, t1 # E : to source validity mask - extql t2, a1, t2 # U : - extql t1, a1, t1 # U : (stall) - andnot t0, t2, t0 # .. e1 : zero place for source to reside (stall) - - or t0, t1, t1 # e1 : and put it there - stq_u t1, 0(a0) # .. 
e0 : (stall) - ret (t9) # e1 : - - cfi_endproc diff --git a/sysdeps/alpha/alphaev6/stxncpy.S b/sysdeps/alpha/alphaev6/stxncpy.S deleted file mode 100644 index ad094cc1df..0000000000 --- a/sysdeps/alpha/alphaev6/stxncpy.S +++ /dev/null @@ -1,392 +0,0 @@ -/* Copyright (C) 2000-2017 Free Software Foundation, Inc. - Contributed by Richard Henderson (rth@tamu.edu) - EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <http://www.gnu.org/licenses/>. */ - -/* Copy no more than COUNT bytes of the null-terminated string from - SRC to DST. - - This is an internal routine used by strncpy, stpncpy, and strncat. - As such, it uses special linkage conventions to make implementation - of these public functions more efficient. - - On input: - t9 = return address - a0 = DST - a1 = SRC - a2 = COUNT - - Furthermore, COUNT may not be zero. - - On output: - t0 = last word written - t8 = bitmask (with one bit set) indicating the last byte written - t10 = bitmask (with one bit set) indicating the byte position of - the end of the range specified by COUNT - a0 = unaligned address of the last *word* written - a2 = the number of full words left in COUNT - - Furthermore, v0, a3-a5, t11, and t12 are untouched. 
-*/ - -#include <sysdep.h> - - .arch ev6 - .set noat - .set noreorder - - .text - .type __stxncpy, @function - .globl __stxncpy - .usepv __stxncpy, no - - cfi_startproc - cfi_return_column (t9) - - /* On entry to this basic block: - t0 == the first destination word for masking back in - t1 == the first source word. */ - .align 4 -stxncpy_aligned: - /* Create the 1st output word and detect 0's in the 1st input word. */ - lda t2, -1 # E : build a mask against false zero - mskqh t2, a1, t2 # U : detection in the src word (stall) - mskqh t1, a1, t3 # U : - ornot t1, t2, t2 # E : (stall) - - mskql t0, a1, t0 # U : assemble the first output word - cmpbge zero, t2, t7 # E : bits set iff null found - or t0, t3, t0 # E : (stall) - beq a2, $a_eoc # U : - - bne t7, $a_eos # U : - nop - nop - nop - - /* On entry to this basic block: - t0 == a source word not containing a null. */ - - /* - * nops here to: - * separate store quads from load quads - * limit of 1 bcond/quad to permit training - */ -$a_loop: - stq_u t0, 0(a0) # L : - addq a0, 8, a0 # E : - subq a2, 1, a2 # E : - nop - - ldq_u t0, 0(a1) # L : - addq a1, 8, a1 # E : - cmpbge zero, t0, t7 # E : - beq a2, $a_eoc # U : - - beq t7, $a_loop # U : - nop - nop - nop - - /* Take care of the final (partial) word store. At this point - the end-of-count bit is set in t7 iff it applies. - - On entry to this basic block we have: - t0 == the source word containing the null - t7 == the cmpbge mask that found it. */ -$a_eos: - negq t7, t8 # E : find low bit set - and t7, t8, t8 # E : (stall) - /* For the sake of the cache, don't read a destination word - if we're not going to need it. */ - and t8, 0x80, t6 # E : (stall) - bne t6, 1f # U : (stall) - - /* We're doing a partial word store and so need to combine - our source and original destination words. */ - ldq_u t1, 0(a0) # L : - subq t8, 1, t6 # E : - or t8, t6, t7 # E : (stall) - zapnot t0, t7, t0 # U : clear src bytes > null (stall) - - zap t1, t7, t1 # .. 
e1 : clear dst bytes <= null - or t0, t1, t0 # e1 : (stall) - nop - nop - -1: stq_u t0, 0(a0) # L : - ret (t9) # L0 : Latency=3 - nop - nop - - /* Add the end-of-count bit to the eos detection bitmask. */ -$a_eoc: - or t10, t7, t7 # E : - br $a_eos # L0 : Latency=3 - nop - nop - - .align 4 -__stxncpy: - /* Are source and destination co-aligned? */ - lda t2, -1 # E : - xor a0, a1, t1 # E : - and a0, 7, t0 # E : find dest misalignment - nop # E : - - srl t2, 1, t2 # U : - and t1, 7, t1 # E : - cmovlt a2, t2, a2 # E : bound count to LONG_MAX (stall) - nop # E : - - addq a2, t0, a2 # E : bias count by dest misalignment - subq a2, 1, a2 # E : (stall) - and a2, 7, t2 # E : (stall) - lda t10, 1 # E : - - srl a2, 3, a2 # U : a2 = loop counter = (count - 1)/8 - sll t10, t2, t10 # U : t10 = bitmask of last count byte - nop # E : - bne t1, $unaligned # U : (stall) - - /* We are co-aligned; take care of a partial first word. */ - ldq_u t1, 0(a1) # L : load first src word - addq a1, 8, a1 # E : - beq t0, stxncpy_aligned # U : avoid loading dest word if not needed - ldq_u t0, 0(a0) # L : - - br stxncpy_aligned # U : - nop - nop - nop - - - -/* The source and destination are not co-aligned. Align the destination - and cope. We have to be very careful about not reading too much and - causing a SEGV. */ - - .align 4 -$u_head: - /* We know just enough now to be able to assemble the first - full source word. We can still find a zero at the end of it - that prevents us from outputting the whole thing. 
- - On entry to this basic block: - t0 == the first dest word, unmasked - t1 == the shifted low bits of the first source word - t6 == bytemask that is -1 in dest word bytes */ - - ldq_u t2, 8(a1) # L : Latency=3 load second src word - addq a1, 8, a1 # E : - mskql t0, a0, t0 # U : mask trailing garbage in dst - extqh t2, a1, t4 # U : (3 cycle stall on t2) - - or t1, t4, t1 # E : first aligned src word complete (stall) - mskqh t1, a0, t1 # U : mask leading garbage in src (stall) - or t0, t1, t0 # E : first output word complete (stall) - or t0, t6, t6 # E : mask original data for zero test (stall) - - cmpbge zero, t6, t7 # E : - beq a2, $u_eocfin # U : - lda t6, -1 # E : - nop - - bne t7, $u_final # U : - mskql t6, a1, t6 # U : mask out bits already seen - stq_u t0, 0(a0) # L : store first output word - or t6, t2, t2 # E : - - cmpbge zero, t2, t7 # E : find nulls in second partial - addq a0, 8, a0 # E : - subq a2, 1, a2 # E : - bne t7, $u_late_head_exit # U : - - /* Finally, we've got all the stupid leading edge cases taken care - of and we can set up to enter the main loop. */ - extql t2, a1, t1 # U : position hi-bits of lo word - beq a2, $u_eoc # U : - ldq_u t2, 8(a1) # L : read next high-order source word - addq a1, 8, a1 # E : - - extqh t2, a1, t0 # U : position lo-bits of hi word (stall) - cmpbge zero, t2, t7 # E : - nop - bne t7, $u_eos # U : - - /* Unaligned copy main loop. In order to avoid reading too much, - the loop is structured to detect zeros in aligned source words. - This has, unfortunately, effectively pulled half of a loop - iteration out into the head and half into the tail, but it does - prevent nastiness from accumulating in the very thing we want - to run as fast as possible. - - On entry to this basic block: - t0 == the shifted low-order bits from the current source word - t1 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word - - We further know that t2 does not contain a null terminator. 
*/ - - .align 4 -$u_loop: - or t0, t1, t0 # E : current dst word now complete - subq a2, 1, a2 # E : decrement word count - extql t2, a1, t1 # U : extract high bits for next time - addq a0, 8, a0 # E : - - stq_u t0, -8(a0) # L : save the current word - beq a2, $u_eoc # U : - ldq_u t2, 8(a1) # L : Latency=3 load high word for next time - addq a1, 8, a1 # E : - - extqh t2, a1, t0 # U : extract low bits (2 cycle stall) - cmpbge zero, t2, t7 # E : test new word for eos - nop - beq t7, $u_loop # U : - - /* We've found a zero somewhere in the source word we just read. - If it resides in the lower half, we have one (probably partial) - word to write out, and if it resides in the upper half, we - have one full and one partial word left to write out. - - On entry to this basic block: - t0 == the shifted low-order bits from the current source word - t1 == the shifted high-order bits from the previous source word - t2 == the unshifted current source word. */ -$u_eos: - or t0, t1, t0 # E : first (partial) source word complete - nop - cmpbge zero, t0, t7 # E : is the null in this first bit? (stall) - bne t7, $u_final # U : (stall) - - stq_u t0, 0(a0) # L : the null was in the high-order bits - addq a0, 8, a0 # E : - subq a2, 1, a2 # E : - nop - -$u_late_head_exit: - extql t2, a1, t0 # U : - cmpbge zero, t0, t7 # E : - or t7, t10, t6 # E : (stall) - cmoveq a2, t6, t7 # E : Latency=2, extra map slot (stall) - - /* Take care of a final (probably partial) result word. - On entry to this basic block: - t0 == assembled source word - t7 == cmpbge mask that found the null. 
*/ -$u_final: - negq t7, t6 # E : isolate low bit set - and t6, t7, t8 # E : (stall) - and t8, 0x80, t6 # E : avoid dest word load if we can (stall) - bne t6, 1f # U : (stall) - - ldq_u t1, 0(a0) # L : - subq t8, 1, t6 # E : - or t6, t8, t7 # E : (stall) - zapnot t0, t7, t0 # U : kill source bytes > null - - zap t1, t7, t1 # U : kill dest bytes <= null - or t0, t1, t0 # E : (stall) - nop - nop - -1: stq_u t0, 0(a0) # L : - ret (t9) # L0 : Latency=3 - - /* Got to end-of-count before end of string. - On entry to this basic block: - t1 == the shifted high-order bits from the previous source word */ -$u_eoc: - and a1, 7, t6 # E : - sll t10, t6, t6 # U : (stall) - and t6, 0xff, t6 # E : (stall) - bne t6, 1f # U : (stall) - - ldq_u t2, 8(a1) # L : load final src word - nop - extqh t2, a1, t0 # U : extract low bits for last word (stall) - or t1, t0, t1 # E : (stall) - -1: cmpbge zero, t1, t7 # E : - mov t1, t0 - -$u_eocfin: # end-of-count, final word - or t10, t7, t7 # E : - br $u_final # L0 : Latency=3 - - /* Unaligned copy entry point. */ - .align 4 -$unaligned: - - ldq_u t1, 0(a1) # L : load first source word - and a0, 7, t4 # E : find dest misalignment - and a1, 7, t5 # E : find src misalignment - /* Conditionally load the first destination word and a bytemask - with 0xff indicating that the destination byte is sacrosanct. */ - mov zero, t0 # E : - - mov zero, t6 # E : - beq t4, 1f # U : - ldq_u t0, 0(a0) # L : - lda t6, -1 # E : - - mskql t6, a0, t6 # U : - nop - nop -1: subq a1, t4, a1 # E : sub dest misalignment from src addr - - /* If source misalignment is larger than dest misalignment, we need - extra startup checks to avoid SEGV. */ - - cmplt t4, t5, t8 # E : - extql t1, a1, t1 # U : shift src into place - lda t2, -1 # E : for creating masks later - beq t8, $u_head # U : (stall) - - mskqh t2, t5, t2 # U : begin src byte validity mask - cmpbge zero, t1, t7 # E : is there a zero? 
- extql t2, a1, t2 # U : - or t7, t10, t5 # E : test for end-of-count too - - cmpbge zero, t2, t3 # E : - cmoveq a2, t5, t7 # E : Latency=2, extra map slot - nop # E : keep with cmoveq - andnot t7, t3, t7 # E : (stall) - - beq t7, $u_head # U : - /* At this point we've found a zero in the first partial word of - the source. We need to isolate the valid source data and mask - it into the original destination data. (Incidentally, we know - that we'll need at least one byte of that original dest word.) */ - ldq_u t0, 0(a0) # L : - negq t7, t6 # E : build bitmask of bytes <= zero - mskqh t1, t4, t1 # U : - - and t6, t7, t8 # E : - subq t8, 1, t6 # E : (stall) - or t6, t8, t7 # E : (stall) - zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall) - - zapnot t1, t7, t1 # U : to source validity mask - andnot t0, t2, t0 # E : zero place for source to reside - or t0, t1, t0 # E : and put it there (stall both t0, t1) - stq_u t0, 0(a0) # L : (stall) - - ret (t9) # L0 : Latency=3 - - cfi_endproc |