aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/alpha/alphaev6/memcpy.S
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@redhat.com>2000-12-08 17:27:11 +0000
committerUlrich Drepper <drepper@redhat.com>2000-12-08 17:27:11 +0000
commit104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c (patch)
tree211aa48f0dbfe5720f4fadaf86e33f9a7dbb5dc8 /sysdeps/alpha/alphaev6/memcpy.S
parent4e9b4067d74d2d943de6918f39c15eb8f8f11b22 (diff)
downloadglibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.tar
glibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.tar.gz
glibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.tar.bz2
glibc-104d0bd3ef1ea7a97ae7b3e4a8a63a08b92cfc1c.zip
Update.
2000-12-08 Jakub Jelinek <jakub@redhat.com> * elf/dl-load.c (fillin_rpath): Don't assume there is '\0' at cp + len. Compute where from dirname. Reported by <jreiser@BitWagon.com>. 2000-12-08 Richard Henderson <rth@twiddle.net> * sysdeps/alpha/_mcount.S: Fix typo. * sysdeps/alpha/strncpy.S: Likewise. * sysdeps/alpha/alphaev6/Implies: New file. * sysdeps/alpha/alphaev67/Implies: New file. * sysdeps/alpha/alphaev67/ffs.S: New file. * sysdeps/alpha/alphaev67/ffsll.S: New file. * sysdeps/alpha/alphaev67/rawmemchr.S: New file. * sysdeps/alpha/alphaev67/stpcpy.S: New file. * sysdeps/alpha/alphaev67/stpncpy.S: New file. * sysdeps/alpha/rawmemchr.S: New file. * sysdeps/alpha/strcat.S: Tail call to __stxcpy. * sysdeps/alpha/strcpy.S: Likewise. From GMP 3.1.1: * sysdeps/alpha/alphaev6/addmul_1.s: New file. From rick.gorton@alpha-processor.com: * sysdeps/alpha/alphaev6/memchr.S: New file. * sysdeps/alpha/alphaev6/memcpy.S: New file. * sysdeps/alpha/alphaev6/memset.S: New file. * sysdeps/alpha/alphaev6/stxcpy.S: New file. * sysdeps/alpha/alphaev6/stxncpy.S: New file. * sysdeps/alpha/alphaev67/strcat.S: New file. * sysdeps/alpha/alphaev67/strchr.S: New file. * sysdeps/alpha/alphaev67/strlen.S: New file. * sysdeps/alpha/alphaev67/strncat.S: New file. * sysdeps/alpha/htonl.S: Use a shorter sequence. 2000-12-08 Jakub Jelinek <jakub@redhat.com> * inet/getnameinfo.c (getnameinfo): Fix NI_NOFQDN support. Reported by <pspencer@fields.utoronto.ca>. 2000-12-07 Jes Sorensen <jes@linuxcare.com> * sysdeps/ia64/elf/start.S (__data_start): Add __data_start variable. Pointed out by Hans Boehm. 2000-12-07 H.J. Lu <hjl@gnu.org> * elf/dl-version.c (match_symbol): Check map->l_name[0] for printing. 2000-12-07 Andreas Jaeger <aj@suse.de> * misc/error.c: Add format attributes for __error and __error_at_line. * nscd/dbg_log.h: Add format attribute. 2000-12-08 Ulrich Drepper <drepper@redhat.com> * misc/sys/syslog.h: Add format attributes to syslog and vsyslog. Patch by Joseph S. 
Myers <jsm28@cam.ac.uk>. * sysdeps/generic/dl-cache.c (HWCAP_CHECK): Fix access to _dl_hwcap. * manual/install.texi: Recommend to set LANGUAGE=C LC_ALL=C during
Diffstat (limited to 'sysdeps/alpha/alphaev6/memcpy.S')
-rw-r--r--sysdeps/alpha/alphaev6/memcpy.S254
1 files changed, 254 insertions, 0 deletions
diff --git a/sysdeps/alpha/alphaev6/memcpy.S b/sysdeps/alpha/alphaev6/memcpy.S
new file mode 100644
index 0000000000..35f17e7f9e
--- /dev/null
+++ b/sysdeps/alpha/alphaev6/memcpy.S
@@ -0,0 +1,254 @@
+/* Copyright (C) 2000 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with the GNU C Library; see the file COPYING.LIB. If not,
+ write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA. */
+
+/*
+ * Much of the information about 21264 scheduling/coding comes from:
+ * Compiler Writer's Guide for the Alpha 21264
+ * abbreviated as 'CWG' in other comments here
+ * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+ * Scheduling notation:
+ * E - either cluster
+ * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
+ * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
+ *
+ * Temp usage notes:
+ * $0 - destination address
+ * $1-$7 - scratch
+ */
+
+#include <sysdep.h>
+
+ .arch ev6
+ .set noreorder
+ .set noat
+
+ENTRY(memcpy)
+ /*
+ * memcpy: copy $18 bytes from the source at $17 to the destination
+ * at $16.
+ *
+ * In: $16 = dest, $17 = src, $18 = byte count
+ * Out: $0 = original dest (copied from $16 on entry)
+ * Uses: $1-$7 as temporaries; $16, $17 and $18 are modified.
+ * Returns through $26.
+ *
+ * The count is tested with signed branches, so a count that is zero
+ * or negative when viewed as a signed quadword copies nothing.
+ *
+ * Strategy:
+ * - src and dest congruent mod 8: byte copy up to an 8-byte boundary,
+ * quad copy up to a 64-byte boundary, then a 64-byte unrolled loop
+ * that issues wh64 hints, then trailing quads and trailing bytes.
+ * - otherwise: byte copy until dest is 8-byte aligned, then build
+ * each aligned destination quad from two ldq_u loads of the source
+ * merged with extql/extqh, then trailing bytes.
+ */
+
+ mov $16, $0 # E : copy dest to return
+ ble $18, $nomoredata # U : done with the copy?
+ xor $16, $17, $1 # E : are source and dest alignments the same?
+ and $1, 7, $1 # E : are they the same mod 8?
+
+ bne $1, $misaligned # U : Nope - gotta do this the slow way
+ /* source and dest are same mod 8 address */
+ and $16, 7, $1 # E : Are both 0mod8?
+ beq $1, $both_0mod8 # U : Yes
+ nop # E :
+
+ /*
+ * source and dest are same misalignment. move a byte at a time
+ * until a 0mod8 alignment for both is reached.
+ * At least one byte more to move
+ */
+
+$head_align:
+ ldbu $1, 0($17) # L : grab a byte
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+ stb $1, 0($16) # L :
+ addq $16, 1, $16 # E : dest++
+ and $16, 7, $1 # E : Are we at 0mod8 yet?
+ ble $18, $nomoredata # U : done with the copy?
+ bne $1, $head_align # U :
+
+ /* Both pointers are 8-byte aligned here and count > 0. */
+$both_0mod8:
+ cmple $18, 127, $1 # E : Can we unroll the loop?
+ bne $1, $no_unroll # U :
+ and $16, 63, $1 # E : get mod64 alignment
+ beq $1, $do_unroll # U : no single quads to fiddle
+
+ /*
+ * Copy single quads until dest is 64-byte aligned. This consumes at
+ * most 56 of the 128 or more bytes that are left, so at least 72
+ * remain on exit.
+ */
+$single_head_quad:
+ ldq $1, 0($17) # L : get 8 bytes
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stq $1, 0($16) # L : store
+ addq $16, 8, $16 # E : dest += 8
+ and $16, 63, $1 # E : get mod64 alignment
+ bne $1, $single_head_quad # U : still not fully aligned
+
+ /*
+ * The unrolled loop issues its wh64 hint one 64-byte block AHEAD of
+ * the block it is copying: the first hint goes to dest+64 and thus
+ * covers dest+64 .. dest+127. The hint declares that the whole block
+ * is about to be over-written, and the Alpha architecture lets the
+ * implementation discard the previous contents of a hinted block.
+ * The loop may therefore only be entered when at least 128 bytes
+ * remain; with fewer, the hinted block would extend past the end of
+ * the destination and memory beyond the buffer could be corrupted.
+ * Remainders of 64..127 bytes are handled by the quad loop below.
+ */
+$do_unroll:
+ addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
+ cmple $18, 127, $1 # E : Room for the look-ahead wh64 block?
+ bne $1, $tail_quads # U : Nope
+ nop # E :
+
+ /*
+ * Each trip copies 64 bytes as two groups of four quads.
+ * On entry: $16 = dest (64-byte aligned), $17 = src, $18 = count,
+ * $7 = address for this trip's wh64.
+ * During the trip $7 is advanced by 64 for the next trip, unless
+ * fewer than 192 bytes were left at the start of this trip; then it
+ * is replaced by the fallback $1 = (dest at trip start) + 64, which
+ * is the block the next trip itself copies.
+ */
+$unroll_body:
+ wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
+ # ($7) are about to be over-written
+ ldq $6, 0($17) # L0 : bytes 0..7
+ nop # E :
+ nop # E :
+
+ ldq $4, 8($17) # L : bytes 8..15
+ ldq $5, 16($17) # L : bytes 16..23
+ addq $7, 64, $7 # E : Update next wh64 address
+ nop # E :
+
+ ldq $3, 24($17) # L : bytes 24..31
+ addq $16, 64, $1 # E : fallback value for wh64
+ nop # E :
+ nop # E :
+
+ addq $17, 32, $17 # E : src += 32 bytes
+ stq $6, 0($16) # L : bytes 0..7
+ nop # E :
+ nop # E :
+
+ stq $4, 8($16) # L : bytes 8..15
+ stq $5, 16($16) # L : bytes 16..23
+ subq $18, 192, $2 # E : At least two more trips to go?
+ nop # E :
+
+ stq $3, 24($16) # L : bytes 24..31
+ addq $16, 32, $16 # E : dest += 32 bytes
+ nop # E :
+ nop # E :
+
+ ldq $6, 0($17) # L : bytes 0..7
+ ldq $4, 8($17) # L : bytes 8..15
+ cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
+ # fallback wh64 address if < 2 more trips
+ nop # E :
+
+ ldq $5, 16($17) # L : bytes 16..23
+ ldq $3, 24($17) # L : bytes 24..31
+ addq $16, 32, $16 # E : dest += 32
+ subq $18, 64, $18 # E : count -= 64
+
+ addq $17, 32, $17 # E : src += 32
+ stq $6, -32($16) # L : bytes 0..7
+ stq $4, -24($16) # L : bytes 8..15
+ cmple $18, 63, $1 # E : At least one more trip?
+
+ stq $5, -16($16) # L : bytes 16..23
+ stq $3, -8($16) # L : bytes 24..31
+ nop # E :
+ beq $1, $unroll_body
+
+ /*
+ * Aligned quads, one at a time. The count is biased by -8 while in
+ * this loop so that a plain sign test detects "fewer than 8 left";
+ * the bias is removed again at $less_than_8.
+ */
+$tail_quads:
+$no_unroll:
+ .align 4
+ subq $18, 8, $18 # E : At least a quad left?
+ blt $18, $less_than_8 # U : Nope
+ nop # E :
+ nop # E :
+
+$move_a_quad:
+ ldq $1, 0($17) # L : fetch 8
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ nop # E :
+
+ stq $1, 0($16) # L : store 8
+ addq $16, 8, $16 # E : dest += 8
+ bge $18, $move_a_quad # U :
+ nop # E :
+
+$less_than_8:
+ .align 4
+ addq $18, 8, $18 # E : add back for trailing bytes
+ ble $18, $nomoredata # U : All-done
+ nop # E :
+ nop # E :
+
+ /* Trailing bytes */
+$tail_bytes:
+ subq $18, 1, $18 # E : count--
+ ldbu $1, 0($17) # L : fetch a byte
+ addq $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($16) # L : store a byte
+ addq $16, 1, $16 # E : dest++
+ bgt $18, $tail_bytes # U : more to be done?
+ nop # E :
+
+ /* branching to exit takes 3 extra cycles, so replicate exit here */
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+ /*
+ * src and dest differ mod 8. From here on $4 is the running
+ * destination pointer and $16 is reused as a data temporary.
+ */
+$misaligned:
+ mov $0, $4 # E : dest temp
+ and $0, 7, $1 # E : dest alignment mod8
+ beq $1, $dest_0mod8 # U : life doesnt totally suck
+ nop
+
+$aligndest:
+ ble $18, $nomoredata # U :
+ ldbu $1, 0($17) # L : fetch a byte
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+
+ stb $1, 0($4) # L : store it
+ addq $4, 1, $4 # E : dest++
+ and $4, 7, $1 # E : dest 0mod8 yet?
+ bne $1, $aligndest # U : go until we are aligned.
+
+ /* Source has unknown alignment, but dest is known to be 0mod8 */
+$dest_0mod8:
+ subq $18, 8, $18 # E : At least a quad left?
+ blt $18, $misalign_tail # U : Nope
+ ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
+ nop # E :
+
+ /*
+ * $3 holds the aligned quad containing src, $16 receives the next
+ * aligned quad. extql/extqh pick the wanted bytes out of each and
+ * bis merges them into one destination quad. The count carries the
+ * same -8 bias as in the aligned quad loop.
+ */
+$mis_quad:
+ ldq_u $16, 8($17) # L : Fetch next 8
+ extql $3, $17, $3 # U : masking
+ extqh $16, $17, $1 # U : masking
+ bis $3, $1, $1 # E : merged bytes to store
+
+ subq $18, 8, $18 # E : count -= 8
+ addq $17, 8, $17 # E : src += 8
+ stq $1, 0($4) # L : store 8 (aligned)
+ mov $16, $3 # E : "rotate" source data
+
+ addq $4, 8, $4 # E : dest += 8
+ bge $18, $mis_quad # U : More quads to move
+ nop
+ nop
+
+$misalign_tail:
+ addq $18, 8, $18 # E : account for tail stuff
+ ble $18, $nomoredata # U :
+ nop
+ nop
+
+$misalign_byte:
+ ldbu $1, 0($17) # L : fetch 1
+ subq $18, 1, $18 # E : count--
+ addq $17, 1, $17 # E : src++
+ nop # E :
+
+ stb $1, 0($4) # L : store
+ addq $4, 1, $4 # E : dest++
+ bgt $18, $misalign_byte # U : more to go?
+ nop
+
+
+$nomoredata:
+ ret $31, ($26), 1 # L0 :
+ nop # E :
+ nop # E :
+ nop # E :
+
+END(memcpy)