aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/alpha/alphaev6
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2000-12-08 17:27:11 +0000
committerUlrich Drepper <drepper@redhat.com>2000-12-08 17:27:11 +0000
commit104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c (patch)
tree211aa48f0dbfe5720f4fadaf86e33f9a7dbb5dc8 /sysdeps/alpha/alphaev6
parent4e9b4067d74d2d943de6918f39c15eb8f8f11b22 (diff)
downloadglibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.tar
glibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.tar.gz
glibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.tar.bz2
glibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.zip
Update.
2000-12-08 Jakub Jelinek <jakub@redhat.com> * elf/dl-load.c (fillin_rpath): Don't assume there is '\0' at cp + len. Compute where from dirname. Reported by <jreiser@BitWagon.com>. 2000-12-08 Richard Henderson <rth@twiddle.net> * sysdeps/alpha/_mcount.S: Fix typo. * sysdeps/alpha/strncpy.S: Likewise. * sysdeps/alpha/alphaev6/Implies: New file. * sysdeps/alpha/alphaev67/Implies: New file. * sysdeps/alpha/alphaev67/ffs.S: New file. * sysdeps/alpha/alphaev67/ffsll.S: New file. * sysdeps/alpha/alphaev67/rawmemchr.S: New file. * sysdeps/alpha/alphaev67/stpcpy.S: New file. * sysdeps/alpha/alphaev67/stpncpy.S: New file. * sysdeps/alpha/rawmemchr.S: New file. * sysdeps/alpha/strcat.S: Tail call to __stxcpy. * sysdeps/alpha/strcpy.S: Likewise. From GMP 3.1.1: * sysdeps/alpha/alphaev6/addmul_1.s: New file. From rick.gorton@alpha-processor.com: * sysdeps/alpha/alphaev6/memchr.S: New file. * sysdeps/alpha/alphaev6/memcpy.S: New file. * sysdeps/alpha/alphaev6/memset.S: New file. * sysdeps/alpha/alphaev6/stxcpy.S: New file. * sysdeps/alpha/alphaev6/stxncpy.S: New file. * sysdeps/alpha/alphaev67/strcat.S: New file. * sysdeps/alpha/alphaev67/strchr.S: New file. * sysdeps/alpha/alphaev67/strlen.S: New file. * sysdeps/alpha/alphaev67/strncat.S: New file. * sysdeps/alpha/htonl.S: Use a shorter sequence. 2000-12-08 Jakub Jelinek <jakub@redhat.com> * inet/getnameinfo.c (getnameinfo): Fix NI_NOFQDN support. Reported by <pspencer@fields.utoronto.ca>. 2000-12-07 Jes Sorensen <jes@linuxcare.com> * sysdeps/ia64/elf/start.S (__data_start): Add __data_start variable. Pointed out by Hans Boehm. 2000-12-07 H.J. Lu <hjl@gnu.org> * elf/dl-version.c (match_symbol): Check map->l_name[0] for printing. 2000-12-07 Andreas Jaeger <aj@suse.de> * misc/error.c: Add format attributes for __error and __error_at_line. * nscd/dbg_log.h: Add format attribute. 2000-12-08 Ulrich Drepper <drepper@redhat.com> * misc/sys/syslog.h: Add format attributes to syslog and vsyslog. Patch by Joseph S. 
Myers <jsm28@cam.ac.uk>. * sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to _dl_hwcap. * manual/install.texi: Recommend to set LANGUAGE=C LC_ALL=C during
Diffstat (limited to 'sysdeps/alpha/alphaev6')
-rw-r--r--sysdeps/alpha/alphaev6/Implies1
-rw-r--r--sysdeps/alpha/alphaev6/addmul_1.s479
-rw-r--r--sysdeps/alpha/alphaev6/memchr.S192
-rw-r--r--sysdeps/alpha/alphaev6/memcpy.S254
-rw-r--r--sysdeps/alpha/alphaev6/memset.S224
-rw-r--r--sysdeps/alpha/alphaev6/stxcpy.S329
-rw-r--r--sysdeps/alpha/alphaev6/stxncpy.S405
7 files changed, 1884 insertions, 0 deletions
diff --git a/sysdeps/alpha/alphaev6/Implies b/sysdeps/alpha/alphaev6/Implies
new file mode 100644
index 0000000000..0e7fc170ba
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/Implies
@@ -0,0 +1 @@
+alpha/alphaev5
diff --git a/sysdeps/alpha/alphaev6/addmul_1.s b/sysdeps/alpha/alphaev6/addmul_1.s
new file mode 100644
index 0000000000..a061fb9edb
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/addmul_1.s
@@ -0,0 +1,479 @@
+ # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+ # the result to a second limb vector.
+ #
+ # Copyright (C) 2000 Free Software Foundation, Inc.
+ #
+ # This file is part of the GNU MP Library.
+ #
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published
+ # by the Free Software Foundation; either version 2.1 of the License, or (at
+ # your option) any later version.
+ #
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+ #
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+ # INPUT PARAMETERS
+ # res_ptr $16
+ # s1_ptr $17
+ # size $18
+ # s2_limb $19
+ #
+ # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+ # exactly 3.625 cycles/limb on EV6...
+ #
+ # This code was written in close cooperation with ev6 pipeline expert
+ # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+ #
+ # Register usages for unrolled loop:
+ # 0-3 mul's
+ # 4-7 acc's
+ # 8-15 mul results
+ # 20,21 carry's
+ # 22,23 save for stores
+ #
+ # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+ #
+ # The stores can issue a cycle late so we have paired no-op's to 'catch'
+ # them, so that further disturbance to the schedule is damped.
+ #
+ # We couldn't pair the loads, because the entangled schedule of the
+ # carry's has to happen on one side {0} of the machine. Note the total
+ # use of U0, and the total use of L0 (after attending to the stores),
+ # which is part of the reason why....
+ #
+ # This is a great schedule for the d_cache, a poor schedule for the
+ # b_cache. The lockup on U0 means that any stall can't be recovered
+ # from. Consider a ldq in L1. say that load gets stalled because it
+ # collides with a fill from the b_Cache. On the next cycle, this load
+ # gets priority. It first looks at L0, and goes there. The instruction
+ # we intended for L0 gets to look at L1, which is NOT where we want
+ # it. It either stalls 1, because it can't go in L0, or goes there, and
+ # causes a further instruction to stall.
+ #
+ # So for b_cache, we're likely going to want to put one or more cycles
+ # back into the code! And, of course, put in prefetches. For the
+ # accumulator, lds, intent to modify. For the multiplier, you might
+ # want ldq, evict next, if you're not wanting to use it again soon. Use
+ # 256 ahead of present pointer value. At a place where we have an mt
+ # followed by a bookkeeping, put the bookkeeping in upper, and the
+ # prefetch into lower.
+ #
+ # Note, the usage of physical registers per cycle is smoothed off, as
+ # much as possible.
+ #
+ # Note, the ldq's and stq's are at the end of the quadpacks. note, we'd
+ # like not to have a ldq or stq precede a conditional branch in a
+ # quadpack. The conditional branch moves the retire pointer one cycle
+ # later.
+ #
+ # Optimization notes:
+ # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
+ # Reserved regs: $29 $30 $31
+ # Free caller-saves regs in unrolled code: $24 $25 $28
+ # We should swap some of the callee-saves regs for some of the free
+ # caller-saves regs, saving some overhead cycles.
+ # Most importantly, we should write fast code for the 0-7 case.
+ # The code we use there is for the 21164, and runs at 7 cycles/limb
+ # on the 21264. Should not be hard, if we write specialized code for
+ # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+ # need a jump table indexed by the low 3 bits of the count argument.
+
+ .set noreorder
+ .set noat
+ .text
+
+ .globl __mpn_addmul_1
+ .ent __mpn_addmul_1
+__mpn_addmul_1:
+ .frame $30,0,$26,0
+ .prologue 0
+
+ cmpult $18, 8, $1 # $1 = (size < 8), unsigned compare
+ beq $1, $Large # size >= 8: use the unrolled code
+
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $18, 1, $18 # size--
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $18, $Lend0b # jump if size was == 1
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $18, 1, $18 # size--
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $4 # $4 = carry out of that add
+ stq $3, 0($16) # *res_ptr = sum
+ addq $16, 8, $16 # res_ptr++
+ beq $18, $Lend0a # jump if size was == 2
+
+ .align 3
+$Loop0: mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subq $18, 1, $18 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # *res_ptr = sum
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ bne $18, $Loop0 # loop while size != 0
+$Lend0a:
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # *res_ptr = sum (last limb)
+ addq $5, $0, $0 # combine carries
+ addq $4, $0, $0 # cy_limb = prod_high + cy
+ ret $31, ($26), 1 # return; $0 holds the final carry limb
+$Lend0b:
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # *res_ptr = sum (only limb)
+ addq $0, $5, $0 # $0 = prod_high + carry
+ ret $31, ($26), 1 # return; $0 holds the final carry limb
+
+$Large:
+ lda $30, -240($30) # reserve 240 bytes of stack
+ stq $9, 8($30) # save $9-$15; they are restored before the final ret
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and $18, 7, $20 # count for the first loop, 0-7
+ srl $18, 3, $18 # count for unrolled loop
+ bis $31, $31, $0 # $0 = 0 (no carry yet)
+ beq $20, $Lunroll # size is a multiple of 8: no leading limbs
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $20, 1, $20 # size--
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ umulh $2, $19, $0 # $0 = prod_high
+ beq $20, $Lend1b # jump if size was == 1
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ subq $20, 1, $20 # size--
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $4 # $4 = carry out of that add
+ stq $3, 0($16) # *res_ptr = sum
+ addq $16, 8, $16 # res_ptr++
+ beq $20, $Lend1a # jump if size was == 2
+
+ .align 3
+$Loop1: mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ subq $20, 1, $20 # size--
+ umulh $2, $19, $4 # $4 = cy_limb
+ ldq $2, 0($17) # $2 = s1_limb
+ addq $17, 8, $17 # s1_ptr++
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # *res_ptr = sum
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ bne $20, $Loop1 # loop while leading limbs remain
+
+$Lend1a:
+ mulq $2, $19, $3 # $3 = prod_low
+ ldq $5, 0($16) # $5 = *res_ptr
+ addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
+ umulh $2, $19, $4 # $4 = cy_limb
+ addq $3, $0, $3 # $3 = cy_limb + prod_low
+ cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
+ addq $5, $3, $3 # add in *res_ptr
+ cmpult $3, $5, $5 # $5 = carry from that add
+ stq $3, 0($16) # *res_ptr = sum
+ addq $16, 8, $16 # res_ptr++
+ addq $5, $0, $0 # combine carries
+ addq $4, $0, $0 # cy_limb = prod_high + cy
+ br $31, $Lunroll # leading limbs done; $0 carries into the unrolled code
+$Lend1b:
+ addq $5, $3, $3 # $3 = *res_ptr + prod_low
+ cmpult $3, $5, $5 # $5 = carry out of that add
+ stq $3, 0($16) # *res_ptr = sum
+ addq $16, 8, $16 # res_ptr++
+ addq $0, $5, $0 # $0 = prod_high + carry
+
+$Lunroll:
+ lda $17, -16($17) # L1 bookkeeping
+ lda $16, -16($16) # L1 bookkeeping
+ bis $0, $31, $12 # $12 = carry limb from the leading limbs (or 0)
+
+ # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq $2, 16($17) # L1
+ ldq $3, 24($17) # L1
+ lda $18, -1($18) # L1 bookkeeping
+ ldq $6, 16($16) # L1
+ ldq $7, 24($16) # L1
+ ldq $0, 32($17) # L1
+ mulq $19, $2, $13 # U1
+ ldq $1, 40($17) # L1
+ umulh $19, $2, $14 # U1
+ mulq $19, $3, $15 # U1
+ lda $17, 64($17) # L1 bookkeeping
+ ldq $4, 32($16) # L1
+ ldq $5, 40($16) # L1
+ umulh $19, $3, $8 # U1
+ ldq $2, -16($17) # L1
+ mulq $19, $0, $9 # U1
+ ldq $3, -8($17) # L1
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ mulq $19, $1, $11 # U1
+ cmpult $6, $13, $20 # L0 lo add => carry
+ lda $16, 64($16) # L1 bookkeeping
+ addq $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, -16($16) # L1
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, -8($16) # L1
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 0($17) # L1
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 8($17) # L1
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # L0 lo + acc
+ stq $22, -48($16) # L0
+ stq $23, -40($16) # L1
+ mulq $19, $3, $15 # U1
+ addq $8, $21, $8 # U0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ ble $18, $Lend # U1 bookkeeping: no further full pass, go drain
+
+ # ____ MAIN UNROLLED LOOP ____
+ .align 4
+$Loop: # each pass stores 8 result limbs; $18 counts the passes left
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 0($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 8($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ ldq $2, 16($17) # L1
+
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ ldq $3, 24($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, -32($16) # L0
+ stq $23, -24($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $18, -1($18) # L1 bookkeeping: one fewer pass to go
+ addq $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, 16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, 24($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 32($17) # L1
+
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 40($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # U0 lo + acc
+ stq $22, -16($16) # L0
+ stq $23, -8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $8, $21, $8 # L0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $17, 64($17) # L1 bookkeeping: s1_ptr += 64 bytes (8 limbs)
+ addq $4, $8, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 32($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 40($16) # L1
+
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ ldq $2, -16($17) # L1
+
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ ldq $3, -8($17) # L1
+
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, 0($16) # L0
+ stq $23, 8($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $1, $11 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $12, $21, $12 # U0 hi mul + carry
+
+ cmpult $6, $13, $20 # L0 lo add => carry
+ bis $31, $31, $31 # U1 mt
+ lda $16, 64($16) # L1 bookkeeping: res_ptr += 64 bytes (8 limbs)
+ addq $6, $12, $22 # U0 hi add => answer
+
+ bis $31, $31, $31 # U1 mt
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ ldq $6, -16($16) # L1
+
+ bis $31, $31, $31 # U1 mt
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ ldq $7, -8($16) # L1
+
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ ldq $0, 0($17) # L1
+
+ mulq $19, $2, $13 # U1
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ ldq $1, 8($17) # L1
+
+ umulh $19, $2, $14 # U1
+ addq $4, $9, $4 # L0 lo + acc
+ stq $22, -48($16) # L0
+ stq $23, -40($16) # L1
+
+ bis $31, $31, $31 # L0 st slosh
+ mulq $19, $3, $15 # U1
+ bis $31, $31, $31 # L1 st slosh
+ addq $8, $21, $8 # U0 hi mul + carry
+
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ bis $31, $31, $31 # L1 mt
+ bgt $18, $Loop # U1 bookkeeping: repeat while passes remain
+
+# ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ ldq $4, 0($16) # L1
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ ldq $5, 8($16) # L1
+ umulh $19, $3, $8 # U1
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ mulq $19, $0, $9 # U1
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ umulh $19, $0, $10 # U1
+ addq $6, $13, $6 # L0 lo + acc
+ stq $22, -32($16) # L0
+ stq $23, -24($16) # L1
+ mulq $19, $1, $11 # U1
+ addq $12, $21, $12 # U0 hi mul + carry
+ cmpult $6, $13, $20 # L0 lo add => carry
+ addq $6, $12, $22 # U0 hi add => answer
+ cmpult $22, $12, $21 # L0 hi add => carry
+ addq $14, $20, $14 # U0 hi mul + carry
+ addq $7, $15, $23 # L0 lo + acc
+ addq $14, $21, $14 # U0 hi mul + carry
+ umulh $19, $1, $12 # U1
+ cmpult $23, $15, $20 # L0 lo add => carry
+ addq $23, $14, $23 # U0 hi add => answer
+ cmpult $23, $14, $21 # L0 hi add => carry
+ addq $8, $20, $8 # U0 hi mul + carry
+ addq $4, $9, $4 # U0 lo + acc
+ stq $22, -16($16) # L0
+ stq $23, -8($16) # L1
+ bis $31, $31, $31 # L0 st slosh
+ addq $8, $21, $8 # L0 hi mul + carry
+ cmpult $4, $9, $20 # L0 lo add => carry
+ addq $4, $8, $22 # U0 hi add => answer
+ cmpult $22, $8, $21 # L0 hi add => carry
+ addq $10, $20, $10 # U0 hi mul + carry
+ addq $5, $11, $23 # L0 lo + acc
+ addq $10, $21, $10 # L0 hi mul + carry
+ cmpult $23, $11, $20 # L0 lo add => carry
+ addq $23, $10, $23 # U0 hi add => answer
+ cmpult $23, $10, $21 # L0 hi add => carry
+ addq $12, $20, $12 # U0 hi mul + carry
+ stq $22, 0($16) # L0
+ stq $23, 8($16) # L1
+ addq $12, $21, $0 # U0 hi mul + carry => final carry limb in $0
+
+ ldq $9, 8($30) # restore the registers saved at $Large
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30) # release the 240 bytes of stack
+ ret $31, ($26), 1 # return; $0 holds the final carry limb
+
+ .end __mpn_addmul_1
diff --git a/sysdeps/alpha/alphaev6/memchr.S b/sysdeps/alpha/alphaev6/memchr.S
new file mode 100644
index 0000000000..0dfcbea76a
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/memchr.S
@@ -0,0 +1,192 @@
+/* Copyright (C) 2000 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by David Mosberger (davidm@cs.arizona.edu).
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+
+ENTRY(__memchr)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ # Hack -- if someone passes in (size_t)-1, hoping to just
+ # search til the end of the address space, we will overflow
+ # below when we find the address of the last byte. Given
+ # that we will never have a 56-bit address space, cropping
+ # the length is the easiest way to avoid trouble.
+ zap $18, 0x80, $5 # U : Bound length
+ beq $18, $not_found # U : zero length => nothing to search
+ ldq_u $1, 0($16) # L : load first quadword Latency=3
+ and $17, 0xff, $17 # E : L L U U : 00000000000000ch
+
+ insbl $17, 1, $2 # U : 000000000000ch00
+ cmpult $18, 9, $4 # E : small (< 1 quad) string?
+ or $2, $17, $17 # E : 000000000000chch
+ lda $3, -1($31) # E : U L L U : $3 = -1 (all ones)
+
+ sll $17, 16, $2 # U : 00000000chch0000
+ addq $16, $5, $5 # E : Max search address
+ or $2, $17, $17 # E : 00000000chchchch
+ sll $17, 32, $2 # U : U L L U : chchchch00000000
+
+ or $2, $17, $17 # E : chchchchchchchch
+ extql $1, $16, $7 # U : $7 is upper bits
+ beq $4, $first_quad # U : 9 or more bytes to search
+ ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search Latency=3
+
+ extqh $6, $16, $6 # U : 2 cycle stall for $6
+ mov $16, $0 # E : $0 = start address, base for the result
+ nop # E :
+ or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
+
+ # Deal with the case where at most 8 bytes remain to be searched
+ # in $1. E.g.:
+ # $18 = 6
+ # $1 = ????c6c5c4c3c2c1
+$last_quad:
+ negq $18, $6 # E :
+ xor $17, $1, $1 # E : matching bytes become zero
+ srl $3, $6, $6 # U : $6 = mask of $18 bits set
+ cmpbge $31, $1, $2 # E : L U L U : bit i of $2 set if byte i is zero
+
+ nop
+ nop
+ and $2, $6, $2 # E : drop matches beyond the length
+ beq $2, $not_found # U : U L U L
+
+$found_it:
+#if defined(__alpha_fix__) && defined(__alpha_cix__)
+ /*
+ * Since we are guaranteed to have set one of the bits, we don't
+ * have to worry about coming back with a 0x40 out of cttz...
+ */
+ cttz $2, $3 # U0 : $3 = index of the lowest set bit in the match mask
+ addq $0, $3, $0 # E : All done
+ nop # E :
+ ret # L0 : L U L U
+#else
+ /*
+ * Slow and clunky. It can probably be improved.
+ * An exercise left for others.
+ */
+ negq $2, $3 # E :
+ and $2, $3, $2 # E : isolate the lowest set bit of the mask
+ and $2, 0x0f, $1 # E : is that bit among bits 0..3?
+ addq $0, 4, $3 # E : candidate address + 4
+
+ cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
+ nop # E : keep with cmov
+ and $2, 0x33, $1 # E : is that bit among bits 0,1,4,5?
+ addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
+
+ cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
+ nop # E : keep with cmov
+ and $2, 0x55, $1 # E : is that bit an even-numbered bit?
+ addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
+
+ cmoveq $1, $3, $0 # E : Latency 2, extra map cycle
+ nop
+ nop
+ ret # L0 : L U L U
+#endif
+
+ # Deal with the case where $18 > 8 bytes remain to be
+ # searched. $16 may not be aligned.
+ .align 4
+$first_quad:
+ andnot $16, 0x7, $0 # E : $0 = $16 rounded down to a quad boundary
+ insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
+ xor $1, $17, $1 # E : matching bytes become zero
+ or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff
+
+ cmpbge $31, $1, $2 # E : bit i of $2 set if byte i is zero
+ bne $2, $found_it # U : match in the first quad
+ # At least one byte left to process.
+ ldq $1, 8($0) # L :
+ subq $5, 1, $18 # E : U L U L : $18 = max search address - 1
+
+ addq $0, 8, $0 # E :
+ # Make $18 point to last quad to be accessed (the
+ # last quad may or may not be partial).
+ andnot $18, 0x7, $18 # E :
+ cmpult $0, $18, $2 # E :
+ beq $2, $final # U : U L U L : $0 is already the last quad
+
+ # At least two quads remain to be accessed.
+
+ subq $18, $0, $4 # E : $4 <- nr quads to be processed
+ and $4, 8, $4 # E : odd number of quads?
+ bne $4, $odd_quad_count # U :
+ # At least three quads remain to be accessed
+ mov $1, $4 # E : L U L U : move prefetched value to correct reg
+
+ .align 4
+$unrolled_loop:
+ ldq $1, 8($0) # L : prefetch $1
+ xor $17, $4, $2 # E :
+ cmpbge $31, $2, $2 # E :
+ bne $2, $found_it # U : U L U L
+
+ addq $0, 8, $0 # E :
+ nop # E :
+ nop # E :
+ nop # E :
+
+$odd_quad_count:
+ xor $17, $1, $2 # E :
+ ldq $4, 8($0) # L : prefetch $4
+ cmpbge $31, $2, $2 # E :
+ addq $0, 8, $6 # E :
+
+ bne $2, $found_it # U :
+ cmpult $6, $18, $6 # E :
+ addq $0, 8, $0 # E :
+ nop # E :
+
+ bne $6, $unrolled_loop # U : next quad is still before the last one
+ mov $4, $1 # E : move prefetched value into $1
+ nop # E :
+ nop # E :
+
+$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
+ nop # E :
+ nop # E :
+ bne $18, $last_quad # U : bytes remain: check them in $last_quad
+
+$not_found:
+ mov $31, $0 # E : return 0 (no match)
+ nop # E :
+ nop # E :
+ ret # L0 :
+
+ END(__memchr)
+
+weak_alias (__memchr, memchr)
+#if !__BOUNDED_POINTERS__
+weak_alias (__memchr, __ubp_memchr)
+#endif
diff --git a/sysdeps/alpha/alphaev6/memcpy.S b/sysdeps/alpha/alphaev6/memcpy.S
new file mode 100644
index 0000000000..35f17e7f9e
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/memcpy.S
@@ -0,0 +1,254 @@
+/* Copyright (C) 2000 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+/*
+ * Much of the information about 21264 scheduling/coding comes from:
+ * Compiler Writer's Guide for the Alpha 21264
+ * abbreviated as 'CWG' in other comments here
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ * E - either cluster
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ * $0 - destination address
+ * $1,$2, - scratch
+ */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+
+ENTRY(memcpy)
+
+ mov $16, $0 # E : copy dest to return
+ ble $18, $nomoredata # U : done with the copy?
+ xor $16, $17, $1 # E : are source and dest alignments the same?
+ and $1, 7, $1 # E : are they the same mod 8?
+
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
+ /* source and dest are same mod 8 address */
+ and $16, 7, $1 # E : Are both 0mod8?
+ beq $1, $both_0mod8 # U : Yes
+ nop # E :
+
+ /*
+ * source and dest are same misalignment. move a byte at a time
+ * until a 0mod8 alignment for both is reached.
+ * At least one byte more to move
+ */
+
+$head_align:
+ ldbu $1, 0($17) # L : grab a byte
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+ stb $1, 0($16) # L : store the byte
+ addq $16, 1, $16 # E : dest++
+ and $16, 7, $1 # E : Are we at 0mod8 yet?
+ ble $18, $nomoredata # U : done with the copy?
+ bne $1, $head_align # U : not aligned yet, copy another byte
+
+$both_0mod8:
+ cmple $18, 127, $1 # E : Can we unroll the loop?
+ bne $1, $no_unroll # U : 127 bytes or fewer: skip the unrolled loop
+ and $16, 63, $1 # E : get mod64 alignment
+ beq $1, $do_unroll # U : no single quads to fiddle
+
+$single_head_quad:
+ ldq $1, 0($17) # L : get 8 bytes
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stq $1, 0($16) # L : store
+ addq $16, 8, $16 # E : dest += 8
+ and $16, 63, $1 # E : get mod64 alignment
+ bne $1, $single_head_quad # U : still not fully aligned
+
+$do_unroll:
+ addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
+ cmple $18, 63, $1 # E : Can we go through the unrolled loop?
+ bne $1, $tail_quads # U : Nope
+ nop # E :
+
+$unroll_body:
+ wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
+ # ($7) are about to be over-written
+ ldq $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldq $4, 8($17) # L : bytes 8..15
+ ldq $5, 16($17) # L : bytes 16..23
+ addq $7, 64, $7 # E : Update next wh64 address
+ nop # E :
+
+ ldq $3, 24($17) # L : bytes 24..31
+ addq $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addq $17, 32, $17 # E : src += 32 bytes
+ stq $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stq $4, 8($16) # L : bytes 8..15
+ stq $5, 16($16) # L : bytes 16..23
+ subq $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stq $3, 24($16) # L : bytes 24..31
+ addq $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldq $6, 0($17) # L : bytes 0..7
+ ldq $4, 8($17) # L : bytes 8..15
+ cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+
+ ldq $5, 16($17) # L : bytes 16..23
+ ldq $3, 24($17) # L : bytes 24..31
+ addq $16, 32, $16 # E : dest += 32
+ subq $18, 64, $18 # E : count -= 64
+
+ addq $17, 32, $17 # E : src += 32
+ stq $6, -32($16) # L : bytes 0..7
+ stq $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stq $5, -16($16) # L : bytes 16..23
+ stq $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body # U : 64 or more bytes left: go again
+
+$tail_quads:
+$no_unroll:
+ .align 4
+ subq $18, 8, $18 # E : At least a quad left?
+ blt $18, $less_than_8 # U : Nope
+ nop # E :
+ nop # E :
+
+$move_a_quad:
+ ldq $1, 0($17) # L : fetch 8
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stq $1, 0($16) # L : store 8
+ addq $16, 8, $16 # E : dest += 8
+ bge $18, $move_a_quad # U : another full quad remains
+ nop # E :
+
+$less_than_8:
+ .align 4
+ addq $18, 8, $18 # E : add back for trailing bytes
+ ble $18, $nomoredata # U : All-done
+ nop # E :
+ nop # E :
+
+ /* Trailing bytes */
+$tail_bytes:
+ subq $18, 1, $18 # E : count--
+ ldbu $1, 0($17) # L : fetch a byte
+ addq $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store a byte
+ addq $16, 1, $16 # E : dest++
+ bgt $18, $tail_bytes # U : more to be done?
+ nop # E :
+
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+$misaligned:
+ mov $0, $4 # E : dest temp
+ and $0, 7, $1 # E : dest alignment mod8
+ beq $1, $dest_0mod8 # U : life doesnt totally suck
+ nop
+
+$aligndest:
+ ble $18, $nomoredata # U : ran out of bytes while aligning dest
+ ldbu $1, 0($17) # L : fetch a byte
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+
+ stb $1, 0($4) # L : store it
+ addq $4, 1, $4 # E : dest++
+ and $4, 7, $1 # E : dest 0mod8 yet?
+ bne $1, $aligndest # U : go until we are aligned.
+
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+ subq $18, 8, $18 # E : At least a quad left?
+ blt $18, $misalign_tail # U : Nope
+ ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
+ nop # E :
+
+$mis_quad:
+ ldq_u $16, 8($17) # L : Fetch next 8
+ extql $3, $17, $3 # U : masking
+ extqh $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ stq $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addq $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad # U : More quads to move
+ nop
+ nop
+
+$misalign_tail:
+ addq $18, 8, $18 # E : account for tail stuff
+ ble $18, $nomoredata # U : no trailing bytes left
+ nop
+ nop
+
+$misalign_byte:
+ ldbu $1, 0($17) # L : fetch 1
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($4) # L : store
+ addq $4, 1, $4 # E : dest++
+ bgt $18, $misalign_byte # U : more to go?
+ nop
+
+
+$nomoredata:
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+END(memcpy)
diff --git a/sysdeps/alpha/alphaev6/memset.S b/sysdeps/alpha/alphaev6/memset.S
new file mode 100644
index 0000000000..363b3a588b
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/memset.S
@@ -0,0 +1,224 @@
+/* Copyright (C) 2000 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+ENTRY(memset)
+#ifdef PROF
+ ldgp gp, 0(pv)
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+ .prologue 1
+#else
+ .prologue 0
+#endif
+
+ /*
+ * $16 = dest, $17 = fill byte, $18 = length; returns dest in $0.
+ * Replicating the fill byte across $17 stalls seriously. The only
+ * mitigation is a major re-write that interleaves this constant
+ * materialization with other parts of the fall-through code. This is
+ * important, even though it makes maintenance tougher. Do this later.
+ */
+ and $17, 255, $1 # E : 00000000000000ch
+ insbl $17, 1, $2 # U : 000000000000ch00
+ mov $16, $0 # E : return value (the original dest)
+ ble $18, $end # U : length <= 0 (signed)? nothing to write
+
+ addq $18, $16, $6 # E : end address (dest + length)
+ or $1, $2, $17 # E : 000000000000chch
+ insbl $1, 2, $3 # U : 0000000000ch0000
+ insbl $1, 3, $4 # U : 00000000ch000000
+
+ or $3, $4, $3 # E : 00000000chch0000
+ inswl $17, 4, $5 # U : 0000chch00000000
+ xor $16, $6, $1 # E : will complete write be within one quadword?
+ inswl $17, 6, $2 # U : chch000000000000
+
+ or $17, $3, $17 # E : 00000000chchchch
+ or $2, $5, $2 # E : chchchch00000000
+ bic $1, 7, $1 # E : zero iff start and end share one quadword
+ and $16, 7, $3 # E : Target addr misalignment
+
+ or $17, $2, $17 # E : chchchchchchchch
+ beq $1, $within_quad # U : start and end in the same quadword
+ nop # E :
+ beq $3, $aligned # U : target is 0mod8
+
+ /*
+ * Target address is misaligned, and won't fit within a quadword.
+ */
+ ldq_u $4, 0($16) # L : Fetch first partial
+ mov $16, $5 # E : Save the address
+ insql $17, $16, $2 # U : Insert new bytes
+ subq $3, 8, $3 # E : $3 = misalignment - 8 (negative head byte count)
+
+ addq $18, $3, $18 # E : $18 is new count ($3 is negative)
+ mskql $4, $16, $4 # U : keep only the old bytes below the start offset
+ subq $16, $3, $16 # E : $16 is new aligned destination
+ or $2, $4, $1 # E : Final bytes
+
+ nop
+ stq_u $1,0($5) # L : Store result
+ nop
+ nop
+
+ .align 4
+$aligned:
+ /*
+ * We are now guaranteed to be quad aligned, with at least
+ * one partial quad to write.
+ */
+
+ sra $18, 3, $3 # U : Number of remaining quads to write
+ and $18, 7, $18 # E : Number of trailing bytes to write
+ mov $16, $5 # E : Save dest address
+ beq $3, $no_quad # U : tail stuff only
+
+ /*
+ * It's worth the effort to unroll this and use wh64 if possible.
+ * At this point, entry values are:
+ * $16 Current destination address
+ * $5 A copy of $16
+ * $6 The end address (original dest + length)
+ * $18 Number of trailing bytes
+ * $3 Number of quads to write
+ */
+
+ and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
+ subq $3, 16, $4 # E : Only try to unroll if >= 16 quads (128 bytes)
+ subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
+ blt $4, $loop # U : fewer than 16 quads: simple loop
+
+ /*
+ * We know we've got at least 16 quads, minimum of one trip
+ * through unrolled loop. Do a quad at a time to get us 0mod64
+ * aligned.
+ */
+
+ nop # E :
+ nop # E :
+ nop # E :
+ beq $1, $bigalign # U : NOTE(review): $1 is in [-64,-8]; looks never taken
+
+$alignmod64:
+ stq $17, 0($5) # L :
+ subq $3, 1, $3 # E : For consistency later
+ addq $1, 8, $1 # E : Increment towards zero for alignment
+ addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
+
+ nop
+ nop
+ addq $5, 8, $5 # E : Inc address
+ blt $1, $alignmod64 # U :
+
+$bigalign:
+ /*
+ * $3 - number of quads left to go
+ * $5 - target address (aligned 0mod64)
+ * $17 - mask of stuff to store
+ * Scratch registers available: $7, $2, $4, $1
+ * We know that we'll be taking a minimum of one trip through.
+ * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
+ * Assumes the wh64 needs to be for 2 trips through the loop in the future.
+ * The wh64 is issued for the starting destination address of trip +2
+ * through the loop, and if there are fewer than two trips left, the target
+ * address will be for the current trip.
+ */
+
+$do_wh64:
+ wh64 ($4) # L1 : memory subsystem write hint
+ subq $3, 24, $2 # E : For determining future wh64 addresses
+ stq $17, 0($5) # L :
+ nop # E :
+
+ addq $5, 128, $4 # E : speculative target of next wh64
+ stq $17, 8($5) # L :
+ stq $17, 16($5) # L :
+ addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
+
+ stq $17, 24($5) # L :
+ stq $17, 32($5) # L :
+ cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
+ nop
+
+ stq $17, 40($5) # L :
+ stq $17, 48($5) # L :
+ subq $3, 16, $2 # E : Repeat the loop at least once more?
+ nop
+
+ stq $17, 56($5) # L :
+ addq $5, 64, $5 # E :
+ subq $3, 8, $3 # E :
+ bge $2, $do_wh64 # U :
+
+ nop
+ nop
+ nop
+ beq $3, $no_quad # U : Might have finished already
+
+ .align 4
+ /*
+ * Simple loop for trailing quadwords, or for small amounts
+ * of data (where we can't use an unrolled loop and wh64)
+ */
+$loop:
+ stq $17, 0($5) # L :
+ subq $3, 1, $3 # E : Decrement number quads left
+ addq $5, 8, $5 # E : Inc address
+ bne $3, $loop # U : more?
+
+$no_quad:
+ /*
+ * Write 0..7 trailing bytes.
+ */
+ nop # E :
+ beq $18, $end # U : All done?
+ ldq $7, 0($5) # L : fetch the quad that holds the tail
+ mskqh $7, $6, $2 # U : Mask final quad (keep old bytes past the end)
+
+ insqh $17, $6, $4 # U : New bits (fill bytes below the end offset)
+ or $2, $4, $1 # E : Put it all together
+ stq $1, 0($5) # L : And back to memory
+ ret $31,($26),1 # L0 :
+
+$within_quad:
+ ldq_u $1, 0($16) # L : fetch the single quad being modified
+ insql $17, $16, $2 # U : New bits
+ mskql $1, $16, $4 # U : Clear old
+ or $2, $4, $2 # E : New result
+
+ mskql $2, $6, $4 # U : drop the new bytes at/after the end offset
+ mskqh $1, $6, $2 # U : keep the old bytes at/after the end offset
+ or $2, $4, $1 # E :
+ stq_u $1, 0($16) # L :
+
+$end:
+ nop
+ nop
+ nop
+ ret $31,($26),1 # L0 :
+
+ END(memset)
diff --git a/sysdeps/alpha/alphaev6/stxcpy.S b/sysdeps/alpha/alphaev6/stxcpy.S
new file mode 100644
index 0000000000..0df20438fc
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/stxcpy.S
@@ -0,0 +1,329 @@
+/* Copyright (C) 2000 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+/* Copy a null-terminated string from SRC to DST.
+
+ This is an internal routine used by strcpy, stpcpy, and strcat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+
+ On output:
+ t8 = bitmask (with one bit set) indicating the last byte written
+ a0 = unaligned address of the last *word* written
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+ .text
+
+/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
+ doesn't like putting the entry point for a procedure somewhere in the
+ middle of the procedure descriptor. Work around this by putting the
+ aligned copy in its own procedure descriptor */
+
+
+ .ent stxcpy_aligned
+ .align 4
+stxcpy_aligned:
+ .frame sp, 0, t9
+ .prologue 0
+
+ /* Co-aligned copy. On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ lda t2, -1 # E : build a mask against false zero
+ mskqh t2, a1, t2 # U : detection in the src word (stall)
+ mskqh t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mskql t0, a1, t0 # U : assemble the first output word
+ cmpbge zero, t2, t8 # E : bits set iff null found
+ or t0, t3, t1 # E : (stall)
+ bne t8, $a_eos # U : (stall)
+
+ /* On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == a source word not containing a null. */
+ /* Nops here to separate store quads from load quads */
+
+$a_loop:
+ stq_u t1, 0(a0) # L :
+ addq a0, 8, a0 # E :
+ nop
+ nop
+
+ ldq_u t1, 0(a1) # L : Latency=3
+ addq a1, 8, a1 # E :
+ cmpbge zero, t1, t8 # E : (3 cycle stall)
+ beq t8, $a_loop # U : (stall for t8)
+
+ /* Take care of the final (partial) word store; return with t8 ==
+ single-bit mask of the null byte. On entry to this basic block:
+ t1 == the source word containing the null
+ t8 == the cmpbge mask that found it. */
+$a_eos:
+ negq t8, t6 # E : find low bit set
+ and t8, t6, t10 # E : t10 = single-bit mask of the null byte (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t10, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : null is byte 7: t8 is already 0x80 (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldq_u t0, 0(a0) # L : Latency=3
+ subq t10, 1, t6 # E :
+ zapnot t1, t6, t1 # U : clear src bytes >= null (stall)
+ or t10, t6, t8 # E : scratch: mask of bytes <= null (stall)
+
+ zap t0, t8, t0 # E : clear dst bytes <= null
+ or t0, t1, t1 # E : (stall)
+ mov t10, t8 # E : t8 = single-bit mask of last byte written
+ nop
+
+1: stq_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ .end stxcpy_aligned
+
+ .align 4
+ .ent __stxcpy
+ .globl __stxcpy
+__stxcpy:
+ .frame sp, 0, t9
+ .prologue 0
+
+ /* Are source and destination co-aligned? */
+ xor a0, a1, t0 # E :
+ unop # E :
+ and t0, 7, t0 # E : (stall)
+ bne t0, $unaligned # U : (stall)
+
+ /* We are co-aligned; take care of a partial first word. */
+ ldq_u t1, 0(a1) # L : load first src word
+ and a0, 7, t0 # E : take care not to load a word ...
+ addq a1, 8, a1 # E :
+ beq t0, stxcpy_aligned # U : ... if we won't need it (stall)
+
+ ldq_u t0, 0(a0) # L :
+ br stxcpy_aligned # L0 : Latency=3
+ nop
+ nop
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, for masking back in, if needed else 0
+ t1 == the low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldq_u t2, 8(a1) # L :
+ addq a1, 8, a1 # E :
+ extql t1, a1, t1 # U : (stall on a1)
+ extqh t2, a1, t4 # U : (stall on a1)
+
+ mskql t0, a0, t0 # U :
+ or t1, t4, t1 # E :
+ mskqh t1, a0, t1 # U : (stall on t1)
+ or t0, t1, t1 # E : (stall on t1)
+
+ or t1, t6, t6 # E :
+ cmpbge zero, t6, t8 # E : (stall)
+ lda t6, -1 # E : for masking just below
+ bne t8, $u_final # U : (stall)
+
+ mskql t6, a1, t6 # U : mask out the bits we have
+ or t6, t2, t2 # E : already extracted before (stall)
+ cmpbge zero, t2, t8 # E : testing eos (stall)
+ bne t8, $u_late_head_exit # U : (stall)
+
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+
+ stq_u t1, 0(a0) # L : store first output word
+ addq a0, 8, a0 # E :
+ extql t2, a1, t0 # U : position ho-bits of lo word
+ ldq_u t2, 8(a1) # L : read next high-order source word
+
+ addq a1, 8, a1 # E :
+ cmpbge zero, t2, t8 # E : (stall for t2)
+ nop # E :
+ bne t8, $u_eos # U : (stall)
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 3
+$u_loop:
+ extqh t2, a1, t1 # U : extract high bits for current word
+ addq a1, 8, a1 # E : (stall)
+ extql t2, a1, t3 # U : extract low bits for next time (stall)
+ addq a0, 8, a0 # E :
+
+ or t0, t1, t1 # E : current dst word now complete
+ ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
+ stq_u t1, -8(a0) # L : save the current word (stall)
+ mov t3, t0 # E :
+
+ cmpbge zero, t2, t8 # E : test new word for eos
+ beq t8, $u_loop # U : (stall)
+ nop
+ nop
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t0 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ extqh t2, a1, t1 # U :
+ or t0, t1, t1 # E : first (partial) source word complete (stall)
+ cmpbge zero, t1, t8 # E : is the null in this first bit? (stall)
+ bne t8, $u_final # U : (stall)
+
+$u_late_head_exit:
+ stq_u t1, 0(a0) # L : the null was in the high-order bits
+ addq a0, 8, a0 # E :
+ extql t2, a1, t1 # U :
+ cmpbge zero, t1, t8 # E : (stall)
+
+ /* Take care of a final (probably partial) result word; return with
+ t8 == single-bit mask of the null byte. On entry to this block:
+ t1 == assembled source word
+ t8 == cmpbge mask that found the null. */
+$u_final:
+ negq t8, t6 # E : isolate low bit set
+ and t6, t8, t10 # E : t10 = single-bit mask of the null byte (stall)
+ and t10, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : null is byte 7: t8 is already 0x80 (stall)
+
+ ldq_u t0, 0(a0) # L :
+ subq t10, 1, t6 # E :
+ or t6, t10, t8 # E : scratch: mask of bytes <= null (stall)
+ zapnot t1, t6, t1 # U : kill source bytes >= null (stall)
+
+ zap t0, t8, t0 # U : kill dest bytes <= null (2 cycle data stall)
+ or t0, t1, t1 # E : (stall)
+ mov t10, t8 # E : t8 = single-bit mask of last byte written
+ nop
+
+1: stq_u t1, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldq_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U :
+ ldq_u t0, 0(a0) # L :
+ lda t6, -1 # E :
+
+ mskql t6, a0, t6 # U :
+ nop
+ nop
+ nop
+1:
+ subq a1, t4, a1 # E : sub dest misalignment from src addr
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+ cmplt t4, t5, t10 # E :
+ beq t10, $u_head # U :
+ lda t2, -1 # E : mask out leading garbage in source
+
+ mskqh t2, t5, t2 # U :
+ ornot t1, t2, t3 # E : (stall)
+ cmpbge zero, t3, t8 # E : is there a zero? (stall)
+ beq t8, $u_head # U : (stall)
+
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+
+ ldq_u t0, 0(a0) # L :
+ negq t8, t6 # E : build bitmask of bytes <= zero
+ and t6, t8, t10 # E : t10 = single-bit mask of the null byte (stall)
+ and a1, 7, t5 # E : t5 = src misalignment - dest misalignment
+
+ subq t10, 1, t6 # E :
+ or t6, t10, t8 # E : scratch: mask of src bytes <= null (stall)
+ srl t10, t5, t10 # U : adjust final null return value
+ zapnot t2, t8, t2 # U : prepare source word; mirror changes (stall)
+
+ and t1, t2, t1 # E : to source validity mask
+ extql t2, a1, t2 # U :
+ extql t1, a1, t1 # U : (stall)
+ andnot t0, t2, t0 # E : zero place for source to reside (stall)
+
+ or t0, t1, t1 # E : and put it there
+ stq_u t1, 0(a0) # L : (stall)
+ mov t10, t8 # E : t8 = single-bit mask of last byte written
+ ret (t9) # L0 : Latency=3
+
+ .end __stxcpy
+
diff --git a/sysdeps/alpha/alphaev6/stxncpy.S b/sysdeps/alpha/alphaev6/stxncpy.S
new file mode 100644
index 0000000000..140279106a
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/stxncpy.S
@@ -0,0 +1,405 @@
+/* Copyright (C) 2000 Free Software Foundation, Inc.
+ Contributed by Richard Henderson (rth@tamu.edu)
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+/* Copy no more than COUNT bytes of the null-terminated string from
+ SRC to DST.
+
+ This is an internal routine used by strncpy, stpncpy, and strncat.
+ As such, it uses special linkage conventions to make implementation
+ of these public functions more efficient.
+
+ On input:
+ t9 = return address
+ a0 = DST
+ a1 = SRC
+ a2 = COUNT
+
+ Furthermore, COUNT may not be zero.
+
+ On output:
+ t0 = last word written
+ t8 = bitmask (with one bit set) indicating the last byte written
+ t10 = bitmask (with one bit set) indicating the byte position of
+ the end of the range specified by COUNT
+ a0 = unaligned address of the last *word* written
+ a2 = the number of full words left in COUNT
+
+ Furthermore, v0, a3-a5, t11, and t12 are untouched.
+*/
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noat
+ .set noreorder
+
+/* There is a problem with either gdb (as of 4.16) or gas (as of 2.7) that
+ doesn't like putting the entry point for a procedure somewhere in the
+ middle of the procedure descriptor. Work around this by putting the
+ aligned copy in its own procedure descriptor */
+
+
+ .ent stxncpy_aligned
+ .align 4
+stxncpy_aligned:
+ .frame sp, 0, t9, 0
+ .prologue 0
+
+ /* Co-aligned copy. On entry to this basic block:
+ t0 == the first destination word for masking back in
+ t1 == the first source word. */
+
+ /* Create the 1st output word and detect 0's in the 1st input word. */
+ lda t2, -1 # E : build a mask against false zero
+ mskqh t2, a1, t2 # U : detection in the src word (stall)
+ mskqh t1, a1, t3 # U :
+ ornot t1, t2, t2 # E : (stall)
+
+ mskql t0, a1, t0 # U : assemble the first output word
+ cmpbge zero, t2, t7 # E : bits set iff null found
+ or t0, t3, t0 # E : (stall)
+ beq a2, $a_eoc # U : count ends within this first word
+
+ bne t7, $a_eos # U : null found in the first word
+ nop
+ nop
+ nop
+
+ /* On entry to this basic block:
+ t0 == a source word not containing a null. */
+
+ /*
+ * nops here to:
+ * separate store quads from load quads
+ * limit of 1 bcond/quad to permit training
+ */
+$a_loop:
+ stq_u t0, 0(a0) # L : store a full word
+ addq a0, 8, a0 # E :
+ subq a2, 1, a2 # E : one fewer word left in count
+ nop
+
+ ldq_u t0, 0(a1) # L : fetch the next source word
+ addq a1, 8, a1 # E :
+ cmpbge zero, t0, t7 # E : bits set iff null found
+ beq a2, $a_eoc # U : count ends within this word
+
+ beq t7, $a_loop # U : no null yet: keep copying
+ nop
+ nop
+ nop
+
+ /* Take care of the final (partial) word store. At this point
+ the end-of-count bit is set in t7 iff it applies.
+
+ On entry to this basic block we have:
+ t0 == the source word containing the null
+ t7 == the cmpbge mask that found it. */
+
+$a_eos:
+ negq t7, t8 # E : find low bit set
+ and t7, t8, t8 # E : t8 = single-bit mask of last byte (stall)
+ /* For the sake of the cache, don't read a destination word
+ if we're not going to need it. */
+ and t8, 0x80, t6 # E : (stall)
+ bne t6, 1f # U : last byte is byte 7: store the whole word (stall)
+
+ /* We're doing a partial word store and so need to combine
+ our source and original destination words. */
+ ldq_u t1, 0(a0) # L :
+ subq t8, 1, t6 # E :
+ or t8, t6, t7 # E : t7 = mask of bytes <= last byte (stall)
+ zapnot t0, t7, t0 # U : clear src bytes > null (stall)
+
+ zap t1, t7, t1 # U : clear dst bytes <= null
+ or t0, t1, t0 # E : (stall)
+ nop
+ nop
+
+1: stq_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+
+ /* Add the end-of-count bit to the eos detection bitmask. */
+$a_eoc:
+ or t10, t7, t7 # E : t10 = bitmask of the last count byte
+ br $a_eos # L0 : Latency=3
+ nop
+ nop
+
+ .end stxncpy_aligned
+
+ .align 4
+ .ent __stxncpy
+ .globl __stxncpy
+__stxncpy:
+ .frame sp, 0, t9, 0
+ .prologue 0
+
+ /* Are source and destination co-aligned? */
+ xor a0, a1, t1 # E :
+ and a0, 7, t0 # E : find dest misalignment
+ and t1, 7, t1 # E : (stall)
+ addq a2, t0, a2 # E : bias count by dest misalignment (stall)
+
+ subq a2, 1, a2 # E :
+ and a2, 7, t2 # E : byte offset of the last count byte (stall)
+ srl a2, 3, a2 # U : a2 = loop counter = (biased count - 1)/8 (stall)
+ addq zero, 1, t10 # E :
+
+ sll t10, t2, t10 # U : t10 = bitmask of last count byte
+ bne t1, $unaligned # U :
+ /* We are co-aligned; take care of a partial first word. */
+ ldq_u t1, 0(a1) # L : load first src word
+ addq a1, 8, a1 # E :
+
+ beq t0, stxncpy_aligned # U : avoid loading dest word if not needed
+ ldq_u t0, 0(a0) # L :
+ nop
+ nop
+
+ br stxncpy_aligned # L0 : Latency=3
+ nop
+ nop
+ nop
+
+
+
+/* The source and destination are not co-aligned. Align the destination
+ and cope. We have to be very careful about not reading too much and
+ causing a SEGV. */
+
+ .align 4
+$u_head:
+ /* We know just enough now to be able to assemble the first
+ full source word. We can still find a zero at the end of it
+ that prevents us from outputting the whole thing.
+
+ On entry to this basic block:
+ t0 == the first dest word, unmasked
+ t1 == the shifted low bits of the first source word
+ t6 == bytemask that is -1 in dest word bytes */
+
+ ldq_u t2, 8(a1) # L : Latency=3 load second src word
+ addq a1, 8, a1 # E :
+ mskql t0, a0, t0 # U : mask trailing garbage in dst
+ extqh t2, a1, t4 # U : (3 cycle stall on t2)
+
+ or t1, t4, t1 # E : first aligned src word complete (stall)
+ mskqh t1, a0, t1 # U : mask leading garbage in src (stall)
+ or t0, t1, t0 # E : first output word complete (stall)
+ or t0, t6, t6 # E : mask original data for zero test (stall)
+
+ cmpbge zero, t6, t7 # E :
+ beq a2, $u_eocfin # U : count ends within this first word
+ nop
+ nop
+
+ bne t7, $u_final # U : null found in the first output word
+ lda t6, -1 # E : mask out the bits we have
+ mskql t6, a1, t6 # U : already seen (stall)
+ stq_u t0, 0(a0) # L : store first output word
+
+ or t6, t2, t2 # E :
+ cmpbge zero, t2, t7 # E : find nulls in second partial (stall)
+ addq a0, 8, a0 # E :
+ subq a2, 1, a2 # E :
+
+ bne t7, $u_late_head_exit # U :
+ /* Finally, we've got all the stupid leading edge cases taken care
+ of and we can set up to enter the main loop. */
+ extql t2, a1, t1 # U : position hi-bits of lo word
+ ldq_u t2, 8(a1) # L : read next high-order source word
+ addq a1, 8, a1 # E :
+
+ cmpbge zero, t2, t7 # E : (stall)
+ beq a2, $u_eoc # U : count ends within the next output word
+ nop
+ nop
+
+ bne t7, $u_eos # U : null found in the word just read
+ nop
+ nop
+ nop
+
+ /* Unaligned copy main loop. In order to avoid reading too much,
+ the loop is structured to detect zeros in aligned source words.
+ This has, unfortunately, effectively pulled half of a loop
+ iteration out into the head and half into the tail, but it does
+ prevent nastiness from accumulating in the very thing we want
+ to run as fast as possible.
+
+ On entry to this basic block:
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word
+
+ We further know that t2 does not contain a null terminator. */
+
+ .align 4
+$u_loop:
+ extqh t2, a1, t0 # U : extract high bits for current word
+ addq a1, 8, a1 # E :
+ extql t2, a1, t3 # U : extract low bits for next time
+ addq a0, 8, a0 # E :
+
+ or t0, t1, t0 # E : current dst word now complete
+ ldq_u t2, 0(a1) # L : Latency=3 load high word for next time
+ stq_u t0, -8(a0) # L : save the current word (stall)
+ mov t3, t1 # E :
+
+ subq a2, 1, a2 # E :
+ cmpbge zero, t2, t7 # E : test new word for eos (2 cycle stall for data)
+ beq a2, $u_eoc # U : (stall)
+ nop
+
+ beq t7, $u_loop # U :
+ nop
+ nop
+ nop
+
+ /* We've found a zero somewhere in the source word we just read.
+ If it resides in the lower half, we have one (probably partial)
+ word to write out, and if it resides in the upper half, we
+ have one full and one partial word left to write out.
+
+ On entry to this basic block:
+ t1 == the shifted high-order bits from the previous source word
+ t2 == the unshifted current source word. */
+$u_eos:
+ extqh t2, a1, t0 # U :
+ or t0, t1, t0 # E : first (partial) source word complete (stall)
+ cmpbge zero, t0, t7 # E : is the null in this first bit? (stall)
+ bne t7, $u_final # U : (stall)
+
+ stq_u t0, 0(a0) # L : the null was in the high-order bits
+ addq a0, 8, a0 # E :
+ subq a2, 1, a2 # E :
+ nop
+
+$u_late_head_exit:
+ extql t2, a1, t0 # U :
+ cmpbge zero, t0, t7 # E :
+ or t7, t10, t6 # E : null mask plus end-of-count bit (stall)
+ cmoveq a2, t6, t7 # E : Latency=2, extra map slot (stall)
+
+ /* Take care of a final (probably partial) result word.
+ On entry to this basic block:
+ t0 == assembled source word
+ t7 == cmpbge mask that found the null. */
+$u_final:
+ negq t7, t6 # E : isolate low bit set
+ and t6, t7, t8 # E : t8 = single-bit mask of last byte (stall)
+ and t8, 0x80, t6 # E : avoid dest word load if we can (stall)
+ bne t6, 1f # U : (stall)
+
+ ldq_u t1, 0(a0) # L :
+ subq t8, 1, t6 # E :
+ or t6, t8, t7 # E : t7 = mask of bytes <= last byte (stall)
+ zapnot t0, t7, t0 # U : kill source bytes > null
+
+ zap t1, t7, t1 # U : kill dest bytes <= null
+ or t0, t1, t0 # E : (stall)
+ nop
+ nop
+
+1: stq_u t0, 0(a0) # L :
+ ret (t9) # L0 : Latency=3
+
+$u_eoc: # end-of-count
+ extqh t2, a1, t0 # U : high bits for the final output word
+ or t0, t1, t0 # E : final output word complete (stall)
+ cmpbge zero, t0, t7 # E : any null in it? (stall)
+ nop
+
+$u_eocfin: # end-of-count, final word
+ or t10, t7, t7 # E : add the end-of-count bit to the null mask
+ br $u_final # L0 : Latency=3
+ nop
+ nop
+
+ /* Unaligned copy entry point. */
+ .align 4
+$unaligned:
+
+ ldq_u t1, 0(a1) # L : load first source word
+ and a0, 7, t4 # E : find dest misalignment
+ and a1, 7, t5 # E : find src misalignment
+ /* Conditionally load the first destination word and a bytemask
+ with 0xff indicating that the destination byte is sacrosanct. */
+ mov zero, t0 # E :
+
+ mov zero, t6 # E :
+ beq t4, 1f # U : dest is aligned: no dest word needed
+ ldq_u t0, 0(a0) # L :
+ lda t6, -1 # E :
+
+ mskql t6, a0, t6 # U :
+ nop
+ nop
+ nop
+1:
+ subq a1, t4, a1 # E : sub dest misalignment from src addr
+
+ /* If source misalignment is larger than dest misalignment, we need
+ extra startup checks to avoid SEGV. */
+
+ cmplt t4, t5, t8 # E : t8 = (dest misalign < src misalign)
+ extql t1, a1, t1 # U : shift src into place
+ lda t2, -1 # E : for creating masks later
+ beq t8, $u_head # U : (stall)
+
+ mskqh t2, t5, t2 # U : begin src byte validity mask
+ cmpbge zero, t1, t7 # E : is there a zero?
+ extql t2, a1, t2 # U :
+ or t7, t10, t5 # E : test for end-of-count too
+
+ cmpbge zero, t2, t3 # E : t3 = bytes that are not valid source data
+ cmoveq a2, t5, t7 # E : Latency=2, extra map slot
+ nop # E : keep with cmoveq
+ andnot t7, t3, t7 # E : ignore hits in invalid bytes (stall)
+
+ beq t7, $u_head # U :
+ /* At this point we've found a zero in the first partial word of
+ the source. We need to isolate the valid source data and mask
+ it into the original destination data. (Incidentally, we know
+ that we'll need at least one byte of that original dest word.) */
+ ldq_u t0, 0(a0) # L :
+ negq t7, t6 # E : build bitmask of bytes <= zero
+ mskqh t1, t4, t1 # U :
+
+ and t6, t7, t8 # E : t8 = single-bit mask of last byte
+ subq t8, 1, t6 # E : (stall)
+ or t6, t8, t7 # E : t7 = mask of bytes <= last byte (stall)
+ zapnot t2, t7, t2 # U : prepare source word; mirror changes (stall)
+
+ zapnot t1, t7, t1 # U : to source validity mask
+ andnot t0, t2, t0 # E : zero place for source to reside
+ or t0, t1, t0 # E : and put it there (stall both t0, t1)
+ stq_u t0, 0(a0) # L : (stall)
+
+ ret (t9) # L0 : Latency=3
+ nop
+ nop
+ nop
+
+ .end __stxncpy
+