author     Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
committer  Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
commit     5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree       4470480d904b65cf14ca524f96f79eca818c3eaf /sysdeps/powerpc/powerpc64/power7
parent     199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization. (zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7')
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/Implies                |    2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/Makefile               |   11
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/add_n.S                |   98
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/bcopy.c                |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/Implies            |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies  |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S         |   70
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S        |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S          |   69
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S         |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S          |   68
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S         |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c           |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c          |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c          |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memchr.S               |  199
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcmp.S               | 1061
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcpy.S               |  430
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memmove.S              |  835
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/mempcpy.S              |  472
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memrchr.S              |  201
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memset.S               |  399
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/multiarch/Implies      |    1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/rawmemchr.S            |  115
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/stpncpy.S              |   24
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcasecmp.S           |  126
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S         |    5
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strchr.S               |  230
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strchrnul.S            |  131
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcmp.S               |  168
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strlen.S               |  107
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncmp.S              |  227
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncpy.S              |  722
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strnlen.S              |  182
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strrchr.S              |  260
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c         |   27
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strstr.S               |  521
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/sub_n.S                |   23
38 files changed, 0 insertions(+), 6793 deletions(-)
diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies
deleted file mode 100644
index 9d68f39d22..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/Implies
+++ /dev/null
@@ -1,2 +0,0 @@
-powerpc/powerpc64/power6/fpu
-powerpc/powerpc64/power6
diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile
deleted file mode 100644
index 89a2296085..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-ifeq ($(subdir),elf)
-# Prevent the use of VSX registers and insns in _dl_start, which under -O3
-# optimization may require a TOC reference before relocations are resolved.
-CFLAGS-rtld.c += -mno-vsx
-endif
-
-ifeq ($(subdir),string)
-sysdep_routines += strstr-ppc64
-CFLAGS-strncase.c += -funroll-loops
-CFLAGS-strncase_l.c += -funroll-loops
-endif
diff --git a/sysdeps/powerpc/powerpc64/power7/add_n.S b/sysdeps/powerpc/powerpc64/power7/add_n.S
deleted file mode 100644
index 6425afbc9f..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/add_n.S
+++ /dev/null
@@ -1,98 +0,0 @@
-/* PowerPC64 mpn_add_n/mpn_sub_n -- mpn addition and
-   subtraction.
- Copyright (C) 2003-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* cycles/limb
- * POWER7 2.18
- */
-
-#ifdef USE_AS_SUB
-# define FUNC __mpn_sub_n
-# define ADDSUBC subfe
-#else
-# define FUNC __mpn_add_n
-# define ADDSUBC adde
-#endif
-
-#define RP r3
-#define UP r4
-#define VP r5
-#define N r6
-
-EALIGN(FUNC, 5, 0)
-#ifdef USE_AS_SUB
- addic r0, r1, -1 /* Set the carry bit, as the subfe chain expects. */
-#else
- addic r0, r0, 0 /* Clear the carry bit for the adde chain. */
-#endif
- andi. r7, N, 1
- beq L(bx0)
-
- ld r7, 0(UP)
- ld r9, 0(VP)
- ADDSUBC r11, r9, r7
- std r11, 0(RP)
- cmpldi N, 1
- beq L(end)
- addi UP, UP, 8
- addi VP, VP, 8
- addi RP, RP, 8
-
-L(bx0): addi r0, N, 2
- srdi r0, r0, 2
- mtctr r0
-
- andi. r7, N, 2
- bne L(mid)
-
- addi UP, UP, 16
- addi VP, VP, 16
- addi RP, RP, 16
-
- .align 5
-L(top): ld r6, -16(UP)
- ld r7, -8(UP)
- ld r8, -16(VP)
- ld r9, -8(VP)
- ADDSUBC r10, r8, r6
- ADDSUBC r11, r9, r7
- std r10, -16(RP)
- std r11, -8(RP)
-L(mid): ld r6, 0(UP)
- ld r7, 8(UP)
- ld r8, 0(VP)
- ld r9, 8(VP)
- ADDSUBC r10, r8, r6
- ADDSUBC r11, r9, r7
- std r10, 0(RP)
- std r11, 8(RP)
- addi UP, UP, 32
- addi VP, VP, 32
- addi RP, RP, 32
- bdnz L(top)
-
-L(end): subfe r3, r0, r0
-#ifdef USE_AS_SUB
- neg r3, r3
-#else
- addi r3, r3, 1
-#endif
- blr
-END(FUNC)
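
For reference, here is a minimal C sketch (not glibc code; helper name hypothetical) of what the routine above computes: rp[i] = up[i] + vp[i] over n 64-bit limbs, with the adde carry chain modeled as an explicit carry variable. The initial addic above sets CA to 0 for addition and 1 for subtraction, which is the initial value of carry here.

#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] + vp[] over n 64-bit limbs; returns the carry out.
   Models the adde chain: CA starts at 0 (addic r0, r0, 0).  */
static uint64_t
mpn_add_n_ref (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
               size_t n)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t u = up[i];
      uint64_t sum = u + vp[i] + carry;
      /* Carry out iff the 64-bit sum wrapped around.  */
      carry = carry ? (sum <= u) : (sum < u);
      rp[i] = sum;
    }
  return carry;
}
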
diff --git a/sysdeps/powerpc/powerpc64/power7/bcopy.c b/sysdeps/powerpc/powerpc64/power7/bcopy.c
deleted file mode 100644
index 4a6a400e7a..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/bcopy.c
+++ /dev/null
@@ -1 +0,0 @@
-/* Implemented in memmove.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
deleted file mode 100644
index 30fa17646e..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power6/fpu
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
deleted file mode 100644
index 410d289a6d..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power6/fpu/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
deleted file mode 100644
index 9ccc758c9e..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
+++ /dev/null
@@ -1,70 +0,0 @@
-/* finite(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-/* int __finite(x) */
- .section ".toc","aw"
-.LC0: /* 1.0 */
- .tc FD_ONE[TC],0x3ff0000000000000
- .section ".text"
- .type __finite, @function
- .machine power7
-EALIGN (__finite, 4, 0)
- CALL_MCOUNT 0
- lfd fp0,.LC0@toc(r2)
- ftdiv cr7,fp1,fp0
- li r3,1
- bflr 30
-
- /* If we are here, we either have +/-INF,
- NaN or denormal. */
-
- stfd fp1,-16(r1) /* Transfer FP to GPR's. */
- ori 2,2,0 /* Force a new dispatch group. */
- lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
- (biased exponent and sign bit). */
- clrlwi r4,r4,17 /* r4 = abs(r4). */
- cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
- bltlr cr7 /* LT means finite, other non-finite. */
- li r3,0
- blr
- END (__finite)
-
-hidden_def (__finite)
-weak_alias (__finite, finite)
-
-/* It turns out that the 'double' version will also always work for
- single-precision. */
-strong_alias (__finite, __finitef)
-hidden_def (__finitef)
-weak_alias (__finitef, finitef)
-
-#if IS_IN (libm)
-# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
-compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
-compat_symbol (libm, finite, finitel, GLIBC_2_0)
-# endif
-#else
-# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
-compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
-compat_symbol (libc, finite, finitel, GLIBC_2_0);
-# endif
-#endif
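
The fallback integer test above has the following C shape (a sketch assuming IEEE-754 binary64 layout; the fast path using ftdiv has no portable C equivalent): a double is finite exactly when its biased exponent field is not all ones.

#include <stdint.h>
#include <string.h>

/* Mirrors the lhz/clrlwi/cmpwi sequence: take the upper 16 bits of
   the bit pattern, clear the sign, and require it to stay below
   0x7ff0 (exponent not all ones).  */
static int
finite_ref (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);   /* stfd + integer reload, in effect.  */
  unsigned hi = (unsigned) (bits >> 48) & 0x7fff;
  return hi < 0x7ff0;
}
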
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
deleted file mode 100644
index 54bd94176d..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
+++ /dev/null
@@ -1 +0,0 @@
-/* This function uses the same code as s_finite.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
deleted file mode 100644
index 4482cddcfa..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
+++ /dev/null
@@ -1,69 +0,0 @@
-/* isinf(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-/* int __isinf(x) */
- .section ".toc","aw"
-.LC0: /* 1.0 */
- .tc FD_ONE[TC],0x3ff0000000000000
- .section ".text"
- .type __isinf, @function
- .machine power7
-EALIGN (__isinf, 4, 0)
- CALL_MCOUNT 0
- lfd fp0,.LC0@toc(r2)
- ftdiv cr7,fp1,fp0
- li r3,0
- bflr 29 /* If not INF, return. */
-
- /* Either we have -INF/+INF or a denormal. */
-
- stfd fp1,-16(r1) /* Transfer FP to GPR's. */
- ori 2,2,0 /* Force a new dispatch group. */
- lhz r4,-16+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
- (biased exponent and sign bit). */
- cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
- li r3,1
- beqlr cr7 /* EQ means INF, otherwise -INF. */
- li r3,-1
- blr
- END (__isinf)
-
-hidden_def (__isinf)
-weak_alias (__isinf, isinf)
-
-/* It turns out that the 'double' version will also always work for
- single-precision. */
-strong_alias (__isinf, __isinff)
-hidden_def (__isinff)
-weak_alias (__isinff, isinff)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__isinf, __isinfl)
-weak_alias (__isinf, isinfl)
-#endif
-
-#if !IS_IN (libm)
-# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
-compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
-compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
-# endif
-#endif
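
In C terms the tail of the routine reduces to the sketch below (assumption: binary64 layout; helper name hypothetical): only the exact +/-INF bit patterns pass the comparison, and the sign bit selects the return value, matching the li r3,1 / beqlr / li r3,-1 sequence.

#include <stdint.h>
#include <string.h>

/* Returns 1 for +INF, -1 for -INF, 0 otherwise.  */
static int
isinf_ref (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  if ((bits & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL)
    return 0;                         /* Not an infinity.  */
  return (bits >> 63) ? -1 : 1;       /* Sign bit picks -INF/+INF.  */
}
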
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
deleted file mode 100644
index be759e091e..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
+++ /dev/null
@@ -1 +0,0 @@
-/* This function uses the same code as s_isinf.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
deleted file mode 100644
index 46b08a0d37..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
+++ /dev/null
@@ -1,68 +0,0 @@
-/* isnan(). PowerPC64/POWER7 version.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <math_ldbl_opt.h>
-
-/* int __isnan(x) */
- .section ".toc","aw"
-.LC0: /* 1.0 */
- .tc FD_ONE[TC],0x3ff0000000000000
- .section ".text"
- .type __isnan, @function
- .machine power7
-EALIGN (__isnan, 4, 0)
- CALL_MCOUNT 0
- lfd fp0,.LC0@toc(r2)
- ftdiv cr7,fp1,fp0
- li r3,0
- bflr 30 /* If not NaN, finish. */
-
- stfd fp1,-16(r1) /* Transfer FP to GPR's. */
- ori 2,2,0 /* Force a new dispatch group. */
- ld r4,-16(r1) /* Load FP into GPR. */
- lis r0,0x7ff0
- sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */
- clrldi r4,r4,1 /* x = fabs(x) */
- cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */
- blelr cr7 /* LE means not NaN. */
- li r3,1 /* else return 1 */
- blr
- END (__isnan)
-
-hidden_def (__isnan)
-weak_alias (__isnan, isnan)
-
-/* It turns out that the 'double' version will also always work for
- single-precision. */
-strong_alias (__isnan, __isnanf)
-hidden_def (__isnanf)
-weak_alias (__isnanf, isnanf)
-
-#ifdef NO_LONG_DOUBLE
-strong_alias (__isnan, __isnanl)
-weak_alias (__isnan, isnanl)
-#endif
-
-#if !IS_IN (libm)
-# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
-compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
-compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
-# endif
-#endif
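
The integer comparison above — clear the sign bit, then test whether the whole 64-bit pattern exceeds that of +INF — looks like this in C (a sketch, binary64 layout assumed):

#include <stdint.h>
#include <string.h>

/* NaN iff fabs(x)'s bit pattern is strictly greater than +INF's,
   i.e. exponent all ones and a nonzero mantissa.  */
static int
isnan_ref (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= 0x7fffffffffffffffULL;       /* clrldi r4,r4,1: fabs(x).  */
  return bits > 0x7ff0000000000000ULL; /* cmpd against 0x7ff00...0.  */
}
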
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
deleted file mode 100644
index b48c85e0d3..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
+++ /dev/null
@@ -1 +0,0 @@
-/* This function uses the same code as s_isnan.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c b/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c
deleted file mode 100644
index 2599c771d9..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/powerpc/power7/fpu/s_logb.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c b/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c
deleted file mode 100644
index 7a5a8032e0..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/powerpc/power7/fpu/s_logbf.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c b/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c
deleted file mode 100644
index 524ae2c78d..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/powerpc/power7/fpu/s_logbl.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/memchr.S b/sysdeps/powerpc/powerpc64/power7/memchr.S
deleted file mode 100644
index 5e9707aa02..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memchr.S
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* int [r3] memchr (char *s [r3], int byte [r4], int size [r5]) */
-
-#ifndef MEMCHR
-# define MEMCHR __memchr
-#endif
- .machine power7
-ENTRY (MEMCHR)
- CALL_MCOUNT 3
- dcbt 0,r3
- clrrdi r8,r3,3
- insrdi r4,r4,8,48
-
- /* Calculate the last acceptable address and check for possible
- addition overflow by using saturated math:
- r7 = r3 + r5
- r7 |= -(r7 < x) */
- add r7,r3,r5
- subfc r6,r3,r7
- subfe r9,r9,r9
- extsw r6,r9
- or r7,r7,r6
-
- insrdi r4,r4,16,32
- cmpldi r5,32
- li r9, -1
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- insrdi r4,r4,32,0
- addi r7,r7,-1
-#ifdef __LITTLE_ENDIAN__
- sld r9,r9,r6
-#else
- srd r9,r9,r6
-#endif
- ble L(small_range)
-
- ld r12,0(r8) /* Load doubleword from memory. */
- cmpb r3,r12,r4 /* Check for BYTEs in DWORD1. */
- and r3,r3,r9
- clrldi r5,r7,61 /* Byte count - 1 in last dword. */
- clrrdi r7,r7,3 /* Address of last doubleword. */
- cmpldi cr7,r3,0 /* Does r3 indicate we got a hit? */
- bne cr7,L(done)
-
- mtcrf 0x01,r8
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
- bt 28,L(loop_setup)
-
- /* Handle DWORD2 of pair. */
- ldu r12,8(r8)
- cmpb r3,r12,r4
- cmpldi cr7,r3,0
- bne cr7,L(done)
-
-L(loop_setup):
- /* The last dword we want to read in the loop below is the one
- containing the last byte of the string, i.e. the dword at
- (s + size - 1) & ~7, or r7. The first dword read is at
- r8 + 8, we read 2 * cnt dwords, so the last dword read will
- be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives
- cnt = (r7 - r8) / 16 */
- sub r6,r7,r8
- srdi r6,r6,4 /* Number of loop iterations. */
- mtctr r6 /* Setup the counter. */
-
- /* Main loop to look for BYTE in the string. Since
- it's a small loop (8 instructions), align it to 32-bytes. */
- .align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the byte-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r3,r12,r4
- cmpb r9,r11,r4
- or r6,r9,r3 /* Merge everything in one doubleword. */
- cmpldi cr7,r6,0
- bne cr7,L(found)
- bdnz L(loop)
-
- /* We may have one more dword to read. */
- cmpld r8,r7
- beqlr
-
- ldu r12,8(r8)
- cmpb r3,r12,r4
- cmpldi cr6,r3,0
- bne cr6,L(done)
- blr
-
- .align 4
-L(found):
- /* OK, one (or both) of the doublewords contains BYTE. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains BYTE. */
- cmpldi cr6,r3,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* BYTE must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r3 so we can calculate the
- pointer. */
-
- mr r3,r9
- addi r8,r8,8
-
- /* r3 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as BYTE in the original
- doubleword from the string. Use that to calculate the pointer.
- We need to make sure BYTE is *before* the end of the range. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r3,-1
- andc r0,r0,r3
- popcntd r0,r0 /* Count trailing zeros. */
-#else
- cntlzd r0,r3 /* Count leading zeros before the match. */
-#endif
- cmpld r8,r7 /* Are we on the last dword? */
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r8,r0
- cmpld cr7,r0,r5 /* If on the last dword, check byte offset. */
- bnelr
- blelr cr7
- li r3,0
- blr
-
- .align 4
-L(null):
- li r3,0
- blr
-
-/* Deals with size <= 32. */
- .align 4
-L(small_range):
- cmpldi r5,0
- beq L(null)
- ld r12,0(r8) /* Load doubleword from memory. */
- cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
- and r3,r3,r9
- cmpldi cr7,r3,0
- clrldi r5,r7,61 /* Byte count - 1 in last dword. */
- clrrdi r7,r7,3 /* Address of last doubleword. */
- cmpld r8,r7 /* Are we done already? */
- bne cr7,L(done)
- beqlr
-
- ldu r12,8(r8)
- cmpb r3,r12,r4
- cmpldi cr6,r3,0
- cmpld r8,r7
- bne cr6,L(done) /* Found something. */
- beqlr /* Hit end of string (length). */
-
- ldu r12,8(r8)
- cmpb r3,r12,r4
- cmpldi cr6,r3,0
- cmpld r8,r7
- bne cr6,L(done)
- beqlr
-
- ldu r12,8(r8)
- cmpb r3,r12,r4
- cmpldi cr6,r3,0
- cmpld r8,r7
- bne cr6,L(done)
- beqlr
-
- ldu r12,8(r8)
- cmpb r3,r12,r4
- cmpldi cr6,r3,0
- bne cr6,L(done)
- blr
-
-END (MEMCHR)
-weak_alias (__memchr, memchr)
-libc_hidden_builtin_def (memchr)
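
Two building blocks of the routine are worth spelling out in C (sketches under the stated assumptions, not glibc code; helper names hypothetical). First, the saturated end-address computation described in the comment near the top ("r7 = r3 + r5; r7 |= -(r7 < x)"); second, the role of cmpb — detect whether any byte of a doubleword equals the search byte — modeled here with the classic zero-byte bit trick.

#include <stdint.h>

/* Saturate s + size so that address-space wraparound cannot make the
   end address smaller than the start.  */
static uintptr_t
saturated_end (uintptr_t s, uintptr_t size)
{
  uintptr_t end = s + size;     /* r7 = r3 + r5 (may wrap).  */
  if (end < s)                  /* Wrapped past the top ...  */
    end = (uintptr_t) -1;       /* ... so clamp: r7 |= -(r7 < s).  */
  return end;
}

/* Nonzero iff some byte of w equals byte c.  cmpb's result is used
   the same way: compared against zero to decide whether to stop.  */
static uint64_t
has_byte (uint64_t w, unsigned char c)
{
  uint64_t pat = 0x0101010101010101ULL * c;  /* insrdi splat of c.  */
  uint64_t x = w ^ pat;                      /* Zero byte where w == c.  */
  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
}
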
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
deleted file mode 100644
index 96ce8cee25..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ /dev/null
@@ -1,1061 +0,0 @@
-/* Optimized memcmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* int [r3] memcmp (const char *s1 [r3],
- const char *s2 [r4],
- size_t size [r5]) */
-#ifndef MEMCMP
-# define MEMCMP memcmp
-#endif
- .machine power7
-EALIGN (MEMCMP, 4, 0)
- CALL_MCOUNT 3
-
-#define rRTN r3
-#define rSTR1 r3 /* first string arg */
-#define rSTR2 r4 /* second string arg */
-#define rN r5 /* max string length */
-#define rWORD1 r6 /* current word in s1 */
-#define rWORD2 r7 /* current word in s2 */
-#define rWORD3 r8 /* next word in s1 */
-#define rWORD4 r9 /* next word in s2 */
-#define rWORD5 r10 /* next word in s1 */
-#define rWORD6 r11 /* next word in s2 */
-
-#define rOFF8 r20 /* 8 bytes offset. */
-#define rOFF16 r21 /* 16 bytes offset. */
-#define rOFF24 r22 /* 24 bytes offset. */
-#define rOFF32 r23 /* 32 bytes offset. */
-#define rWORD8_SHIFT r24 /* Left rotation temp for rWORD2. */
-#define rWORD2_SHIFT r25 /* Left rotation temp for rWORD4. */
-#define rWORD4_SHIFT r26 /* Left rotation temp for rWORD6. */
-#define rWORD6_SHIFT r27 /* Left rotation temp for rWORD8. */
-#define rSHR r28 /* Unaligned shift right count. */
-#define rSHL r29 /* Unaligned shift left count. */
-#define rWORD7 r30 /* next word in s1 */
-#define rWORD8 r31 /* next word in s2 */
-
-#define rWORD8SAVE (-8)
-#define rWORD7SAVE (-16)
-#define rOFF8SAVE (-24)
-#define rOFF16SAVE (-32)
-#define rOFF24SAVE (-40)
-#define rOFF32SAVE (-48)
-#define rSHRSAVE (-56)
-#define rSHLSAVE (-64)
-#define rWORD8SHIFTSAVE (-72)
-#define rWORD2SHIFTSAVE (-80)
-#define rWORD4SHIFTSAVE (-88)
-#define rWORD6SHIFTSAVE (-96)
-
-#ifdef __LITTLE_ENDIAN__
-# define LD ldbrx
-#else
-# define LD ldx
-#endif
-
- xor r0, rSTR2, rSTR1
- cmpldi cr6, rN, 0
- cmpldi cr1, rN, 12
- clrldi. r0, r0, 61
- clrldi r12, rSTR1, 61
- cmpldi cr5, r12, 0
- beq- cr6, L(zeroLength)
- dcbt 0, rSTR1
- dcbt 0, rSTR2
-/* If less than 8 bytes or not aligned, use the unaligned
- byte loop. */
- blt cr1, L(bytealigned)
- std rWORD8, rWORD8SAVE(r1)
- std rWORD7, rWORD7SAVE(r1)
- std rOFF8, rOFF8SAVE(r1)
- std rOFF16, rOFF16SAVE(r1)
- std rOFF24, rOFF24SAVE(r1)
- std rOFF32, rOFF32SAVE(r1)
- cfi_offset(rWORD8, rWORD8SAVE)
- cfi_offset(rWORD7, rWORD7SAVE)
- cfi_offset(rOFF8, rOFF8SAVE)
- cfi_offset(rOFF16, rOFF16SAVE)
- cfi_offset(rOFF24, rOFF24SAVE)
- cfi_offset(rOFF32, rOFF32SAVE)
-
- li rOFF8,8
- li rOFF16,16
- li rOFF24,24
- li rOFF32,32
-
- bne L(unaligned)
-/* At this point we know both strings have the same alignment and the
- compare length is at least 8 bytes. r12 contains the low order
- 3 bits of rSTR1 and cr5 contains the result of the logical compare
- of r12 to 0. If r12 == 0 then we are already double word
- aligned and can perform the DW aligned loop.
-
- Otherwise we know the two strings have the same alignment (but not
- yet DW). So we force the string addresses to the next lower DW
- boundary and special case this first DW using shift left to
- eliminate bits preceding the first byte. Since we want to join the
- normal (DW aligned) compare loop, starting at the second double word,
- we need to adjust the length (rN) and special case the loop
- versioning for the first DW. This ensures that the loop count is
- correct and the first DW (shifted) is in the expected register pair. */
- .align 4
-L(samealignment):
- clrrdi rSTR1, rSTR1, 3
- clrrdi rSTR2, rSTR2, 3
- beq cr5, L(DWaligned)
- add rN, rN, r12
- sldi rWORD6, r12, 3
- srdi r0, rN, 5 /* Divide by 32 */
- andi. r12, rN, 24 /* Get the DW remainder */
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- clrldi rN, rN, 61
- beq L(dPs4)
- mtctr r0
- bgt cr1, L(dPs3)
- beq cr1, L(dPs2)
-
-/* Remainder is 8 */
- .align 3
-L(dsP1):
- sld rWORD5, rWORD1, rWORD6
- sld rWORD6, rWORD2, rWORD6
- cmpld cr5, rWORD5, rWORD6
- blt cr7, L(dP1x)
-/* Do something useful in this cycle since we have to branch anyway. */
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr7, rWORD1, rWORD2
- b L(dP1e)
-/* Remainder is 16 */
- .align 4
-L(dPs2):
- sld rWORD5, rWORD1, rWORD6
- sld rWORD6, rWORD2, rWORD6
- cmpld cr6, rWORD5, rWORD6
- blt cr7, L(dP2x)
-/* Do something useful in this cycle since we have to branch anyway. */
- LD rWORD7, rOFF8, rSTR1
- LD rWORD8, rOFF8, rSTR2
- cmpld cr5, rWORD7, rWORD8
- b L(dP2e)
-/* Remainder is 24 */
- .align 4
-L(dPs3):
- sld rWORD3, rWORD1, rWORD6
- sld rWORD4, rWORD2, rWORD6
- cmpld cr1, rWORD3, rWORD4
- b L(dP3e)
-/* Count is a multiple of 32, remainder is 0 */
- .align 4
-L(dPs4):
- mtctr r0
- sld rWORD1, rWORD1, rWORD6
- sld rWORD2, rWORD2, rWORD6
- cmpld cr7, rWORD1, rWORD2
- b L(dP4e)
-
-/* At this point we know both strings are double word aligned and the
- compare length is at least 8 bytes. */
- .align 4
-L(DWaligned):
- andi. r12, rN, 24 /* Get the DW remainder */
- srdi r0, rN, 5 /* Divide by 32 */
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- clrldi rN, rN, 61
- beq L(dP4)
- bgt cr1, L(dP3)
- beq cr1, L(dP2)
-
-/* Remainder is 8 */
- .align 4
-L(dP1):
- mtctr r0
-/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
- (8-15 byte compare), we want to use only volatile registers. This
- means we can avoid restoring non-volatile registers since we did not
- change any on the early exit path. The key here is the non-early
- exit path only cares about the condition code (cr5), not about which
- register pair was used. */
- LD rWORD5, 0, rSTR1
- LD rWORD6, 0, rSTR2
- cmpld cr5, rWORD5, rWORD6
- blt cr7, L(dP1x)
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr7, rWORD1, rWORD2
-L(dP1e):
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr1, rWORD3, rWORD4
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5x)
- bne cr7, L(dLcr7x)
-
- LD rWORD7, rOFF32, rSTR1
- LD rWORD8, rOFF32, rSTR2
- addi rSTR1, rSTR1, 32
- addi rSTR2, rSTR2, 32
- bne cr1, L(dLcr1)
- cmpld cr5, rWORD7, rWORD8
- bdnz L(dLoop)
- bne cr6, L(dLcr6)
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- .align 3
-L(dP1x):
- sldi. r12, rN, 3
- bne cr5, L(dLcr5x)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- bne L(d00)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-/* Remainder is 16 */
- .align 4
-L(dP2):
- mtctr r0
- LD rWORD5, 0, rSTR1
- LD rWORD6, 0, rSTR2
- cmpld cr6, rWORD5, rWORD6
- blt cr7, L(dP2x)
- LD rWORD7, rOFF8, rSTR1
- LD rWORD8, rOFF8, rSTR2
- cmpld cr5, rWORD7, rWORD8
-L(dP2e):
- LD rWORD1, rOFF16, rSTR1
- LD rWORD2, rOFF16, rSTR2
- cmpld cr7, rWORD1, rWORD2
- LD rWORD3, rOFF24, rSTR1
- LD rWORD4, rOFF24, rSTR2
- cmpld cr1, rWORD3, rWORD4
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr6, L(dLcr6)
- bne cr5, L(dLcr5)
- b L(dLoop2)
- .align 4
-L(dP2x):
- LD rWORD3, rOFF8, rSTR1
- LD rWORD4, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- sldi. r12, rN, 3
- bne cr6, L(dLcr6x)
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr1, L(dLcr1x)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- bne L(d00)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-/* Remainder is 24 */
- .align 4
-L(dP3):
- mtctr r0
- LD rWORD3, 0, rSTR1
- LD rWORD4, 0, rSTR2
- cmpld cr1, rWORD3, rWORD4
-L(dP3e):
- LD rWORD5, rOFF8, rSTR1
- LD rWORD6, rOFF8, rSTR2
- cmpld cr6, rWORD5, rWORD6
- blt cr7, L(dP3x)
- LD rWORD7, rOFF16, rSTR1
- LD rWORD8, rOFF16, rSTR2
- cmpld cr5, rWORD7, rWORD8
- LD rWORD1, rOFF24, rSTR1
- LD rWORD2, rOFF24, rSTR2
- cmpld cr7, rWORD1, rWORD2
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- bne cr1, L(dLcr1)
- bne cr6, L(dLcr6)
- b L(dLoop1)
-/* Again we are on an early exit path (24-31 byte compare), we want to
- only use volatile registers and avoid restoring non-volatile
- registers. */
- .align 4
-L(dP3x):
- LD rWORD1, rOFF16, rSTR1
- LD rWORD2, rOFF16, rSTR2
- cmpld cr7, rWORD1, rWORD2
- sldi. r12, rN, 3
- bne cr1, L(dLcr1x)
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- bne cr6, L(dLcr6x)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- bne cr7, L(dLcr7x)
- bne L(d00)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-/* Count is a multiple of 32, remainder is 0 */
- .align 4
-L(dP4):
- mtctr r0
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpld cr7, rWORD1, rWORD2
-L(dP4e):
- LD rWORD3, rOFF8, rSTR1
- LD rWORD4, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- LD rWORD5, rOFF16, rSTR1
- LD rWORD6, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- LD rWORD7, rOFF24, rSTR1
- LD rWORD8, rOFF24, rSTR2
- addi rSTR1, rSTR1, 24
- addi rSTR2, rSTR2, 24
- cmpld cr5, rWORD7, rWORD8
- bne cr7, L(dLcr7)
- bne cr1, L(dLcr1)
- bdz- L(d24) /* Adjust CTR as we start with +4 */
-/* This is the primary loop */
- .align 4
-L(dLoop):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(dLcr6)
-L(dLoop1):
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5)
-L(dLoop2):
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr7, L(dLcr7)
-L(dLoop3):
- LD rWORD7, rOFF32, rSTR1
- LD rWORD8, rOFF32, rSTR2
- addi rSTR1, rSTR1, 32
- addi rSTR2, rSTR2, 32
- bne cr1, L(dLcr1)
- cmpld cr7, rWORD1, rWORD2
- bdnz L(dLoop)
-
-L(dL4):
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(dLcr6)
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5)
- cmpld cr5, rWORD7, rWORD8
-L(d44):
- bne cr7, L(dLcr7)
-L(d34):
- bne cr1, L(dLcr1)
-L(d24):
- bne cr6, L(dLcr6)
-L(d14):
- sldi. r12, rN, 3
- bne cr5, L(dLcr5)
-L(d04):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- beq L(duzeroLength)
-/* At this point we have a remainder of 1 to 7 bytes to compare. Since
- we are aligned it is safe to load the whole double word, and use
- shift right double to eliminate bits beyond the compare length. */
-L(d00):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- srd rWORD1, rWORD1, rN
- srd rWORD2, rWORD2, rN
- cmpld cr7, rWORD1, rWORD2
- bne cr7, L(dLcr7x)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
- .align 4
-L(dLcr7):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr7x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr7
- li rRTN, -1
- blr
- .align 4
-L(dLcr1):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr1x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr1
- li rRTN, -1
- blr
- .align 4
-L(dLcr6):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr6x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
- .align 4
-L(dLcr5):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dLcr5x):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 1
- bgtlr cr5
- li rRTN, -1
- blr
-
- .align 4
-L(bytealigned):
- mtctr rN
-
-/* We need to prime this loop. This loop is swing modulo scheduled
- to avoid pipe delays. The dependent instruction latency (load to
- compare to conditional branch) is 2 to 3 cycles. In this loop each
- dispatch group ends in a branch and takes 1 cycle. Effectively
- the first iteration of the loop only serves to load operands and
- branches based on compares are delayed until the next loop.
-
- So we must precondition some registers and condition codes so that
- we don't exit the loop early on the first iteration. */
-
- lbz rWORD1, 0(rSTR1)
- lbz rWORD2, 0(rSTR2)
- bdz L(b11)
- cmpld cr7, rWORD1, rWORD2
- lbz rWORD3, 1(rSTR1)
- lbz rWORD4, 1(rSTR2)
- bdz L(b12)
- cmpld cr1, rWORD3, rWORD4
- lbzu rWORD5, 2(rSTR1)
- lbzu rWORD6, 2(rSTR2)
- bdz L(b13)
- .align 4
-L(bLoop):
- lbzu rWORD1, 1(rSTR1)
- lbzu rWORD2, 1(rSTR2)
- bne cr7, L(bLcr7)
-
- cmpld cr6, rWORD5, rWORD6
- bdz L(b3i)
-
- lbzu rWORD3, 1(rSTR1)
- lbzu rWORD4, 1(rSTR2)
- bne cr1, L(bLcr1)
-
- cmpld cr7, rWORD1, rWORD2
- bdz L(b2i)
-
- lbzu rWORD5, 1(rSTR1)
- lbzu rWORD6, 1(rSTR2)
- bne cr6, L(bLcr6)
-
- cmpld cr1, rWORD3, rWORD4
- bdnz L(bLoop)
-
-/* We speculatively load bytes before we have tested the previous
- bytes. But we must avoid overrunning the length (in the ctr) to
- prevent these speculative loads from causing a segfault. In that
- case the loop will exit early (before all pending bytes are
- tested), and we must complete the pending operations
- before returning. */
-L(b1i):
- bne cr7, L(bLcr7)
- bne cr1, L(bLcr1)
- b L(bx56)
- .align 4
-L(b2i):
- bne cr6, L(bLcr6)
- bne cr7, L(bLcr7)
- b L(bx34)
- .align 4
-L(b3i):
- bne cr1, L(bLcr1)
- bne cr6, L(bLcr6)
- b L(bx12)
- .align 4
-L(bLcr7):
- li rRTN, 1
- bgtlr cr7
- li rRTN, -1
- blr
-L(bLcr1):
- li rRTN, 1
- bgtlr cr1
- li rRTN, -1
- blr
-L(bLcr6):
- li rRTN, 1
- bgtlr cr6
- li rRTN, -1
- blr
-
-L(b13):
- bne cr7, L(bx12)
- bne cr1, L(bx34)
-L(bx56):
- sub rRTN, rWORD5, rWORD6
- blr
- nop
-L(b12):
- bne cr7, L(bx12)
-L(bx34):
- sub rRTN, rWORD3, rWORD4
- blr
-L(b11):
-L(bx12):
- sub rRTN, rWORD1, rWORD2
- blr
-
- .align 4
-L(zeroLength):
- li rRTN, 0
- blr
-
- .align 4
-/* At this point we know the strings have different alignment and the
- compare length is at least 8 bytes. r12 contains the low order
- 3 bits of rSTR1 and cr5 contains the result of the logical compare
- of r12 to 0. If r12 == 0 then rSTR1 is double word
- aligned and we can perform the DWunaligned loop.
-
- Otherwise we know that rSTR1 is not already DW aligned yet.
- So we can force the string addresses to the next lower DW
- boundary and special case this first DW using shift left to
- eliminate bits preceding the first byte. Since we want to join the
- normal (DWaligned) compare loop, starting at the second double word,
- we need to adjust the length (rN) and special case the loop
- versioning for the first DW. This ensures that the loop count is
- correct and the first DW (shifted) is in the expected register pair. */
-L(unaligned):
- std rSHL, rSHLSAVE(r1)
- cfi_offset(rSHL, rSHLSAVE)
- clrldi rSHL, rSTR2, 61
- beq cr6, L(duzeroLength)
- std rSHR, rSHRSAVE(r1)
- cfi_offset(rSHR, rSHRSAVE)
- beq cr5, L(DWunaligned)
- std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
-/* Adjust the logical start of rSTR2 to compensate for the extra bits
- in the 1st rSTR1 DW. */
- sub rWORD8_SHIFT, rSTR2, r12
-/* But do not attempt to address the DW before that DW that contains
- the actual start of rSTR2. */
- clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
-/* Compute the left/right shift counts for the unaligned rSTR2,
- compensating for the logical (DW aligned) start of rSTR1. */
- clrldi rSHL, rWORD8_SHIFT, 61
- clrrdi rSTR1, rSTR1, 3
- std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
- sldi rSHL, rSHL, 3
- cmpld cr5, rWORD8_SHIFT, rSTR2
- add rN, rN, r12
- sldi rWORD6, r12, 3
- std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
- cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
- cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
- subfic rSHR, rSHL, 64
- srdi r0, rN, 5 /* Divide by 32 */
- andi. r12, rN, 24 /* Get the DW remainder */
-/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
- this special case those bits may be discarded anyway. Also we
- must avoid loading a DW where none of the bits are part of rSTR2 as
- this may cross a page boundary and cause a page fault. */
- li rWORD8, 0
- blt cr5, L(dus0)
- LD rWORD8, 0, rSTR2
- addi rSTR2, rSTR2, 8
- sld rWORD8, rWORD8, rSHL
-
-L(dus0):
- LD rWORD1, 0, rSTR1
- LD rWORD2, 0, rSTR2
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- srd r12, rWORD2, rSHR
- clrldi rN, rN, 61
- beq L(duPs4)
- mtctr r0
- or rWORD8, r12, rWORD8
- bgt cr1, L(duPs3)
- beq cr1, L(duPs2)
-
-/* Remainder is 8 */
- .align 4
-L(dusP1):
- sld rWORD8_SHIFT, rWORD2, rSHL
- sld rWORD7, rWORD1, rWORD6
- sld rWORD8, rWORD8, rWORD6
- bge cr7, L(duP1e)
-/* At this point we exit early with the first double word compare
- complete and remainder of 0 to 7 bytes. See L(du14) for details on
- how we handle the remaining bytes. */
- cmpld cr5, rWORD7, rWORD8
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-/* Remainder is 16 */
- .align 4
-L(duPs2):
- sld rWORD6_SHIFT, rWORD2, rSHL
- sld rWORD5, rWORD1, rWORD6
- sld rWORD6, rWORD8, rWORD6
- b L(duP2e)
-/* Remainder is 24 */
- .align 4
-L(duPs3):
- sld rWORD4_SHIFT, rWORD2, rSHL
- sld rWORD3, rWORD1, rWORD6
- sld rWORD4, rWORD8, rWORD6
- b L(duP3e)
-/* Count is a multiple of 32, remainder is 0 */
- .align 4
-L(duPs4):
- mtctr r0
- or rWORD8, r12, rWORD8
- sld rWORD2_SHIFT, rWORD2, rSHL
- sld rWORD1, rWORD1, rWORD6
- sld rWORD2, rWORD8, rWORD6
- b L(duP4e)
-
-/* At this point we know rSTR1 is double word aligned and the
- compare length is at least 8 bytes. */
- .align 4
-L(DWunaligned):
- std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
- srdi r0, rN, 5 /* Divide by 32 */
- std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
- andi. r12, rN, 24 /* Get the DW remainder */
- std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
- cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
- cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
- cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
- sldi rSHL, rSHL, 3
- LD rWORD6, 0, rSTR2
- LD rWORD8, rOFF8, rSTR2
- addi rSTR2, rSTR2, 8
- cmpldi cr1, r12, 16
- cmpldi cr7, rN, 32
- clrldi rN, rN, 61
- subfic rSHR, rSHL, 64
- sld rWORD6_SHIFT, rWORD6, rSHL
- beq L(duP4)
- mtctr r0
- bgt cr1, L(duP3)
- beq cr1, L(duP2)
-
-/* Remainder is 8 */
- .align 4
-L(duP1):
- srd r12, rWORD8, rSHR
- LD rWORD7, 0, rSTR1
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- blt cr7, L(duP1x)
-L(duP1e):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr5, rWORD7, rWORD8
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr7, rWORD1, rWORD2
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- bne cr5, L(duLcr5)
- or rWORD4, r12, rWORD2_SHIFT
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr1, rWORD3, rWORD4
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- bne cr7, L(duLcr7)
- or rWORD6, r0, rWORD4_SHIFT
- cmpld cr6, rWORD5, rWORD6
- b L(duLoop3)
- .align 4
-/* At this point we exit early with the first double word compare
- complete and remainder of 0 to 7 bytes. See L(du14) for details on
- how we handle the remaining bytes. */
-L(duP1x):
- cmpld cr5, rWORD7, rWORD8
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-/* Remainder is 16 */
- .align 4
-L(duP2):
- srd r0, rWORD8, rSHR
- LD rWORD5, 0, rSTR1
- or rWORD6, r0, rWORD6_SHIFT
- sld rWORD6_SHIFT, rWORD8, rSHL
-L(duP2e):
- LD rWORD7, rOFF8, rSTR1
- LD rWORD8, rOFF8, rSTR2
- cmpld cr6, rWORD5, rWORD6
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- blt cr7, L(duP2x)
- LD rWORD1, rOFF16, rSTR1
- LD rWORD2, rOFF16, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr6, L(duLcr6)
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
- LD rWORD3, rOFF24, rSTR1
- LD rWORD4, rOFF24, rSTR2
- cmpld cr7, rWORD1, rWORD2
- bne cr5, L(duLcr5)
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- or rWORD4, r12, rWORD2_SHIFT
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- cmpld cr1, rWORD3, rWORD4
- b L(duLoop2)
- .align 4
-L(duP2x):
- cmpld cr5, rWORD7, rWORD8
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
- bne cr6, L(duLcr6)
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-
-/* Remainder is 24 */
- .align 4
-L(duP3):
- srd r12, rWORD8, rSHR
- LD rWORD3, 0, rSTR1
- sld rWORD4_SHIFT, rWORD8, rSHL
- or rWORD4, r12, rWORD6_SHIFT
-L(duP3e):
- LD rWORD5, rOFF8, rSTR1
- LD rWORD6, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- or rWORD6, r0, rWORD4_SHIFT
- LD rWORD7, rOFF16, rSTR1
- LD rWORD8, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr1, L(duLcr1)
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- blt cr7, L(duP3x)
- LD rWORD1, rOFF24, rSTR1
- LD rWORD2, rOFF24, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr6, L(duLcr6)
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- cmpld cr7, rWORD1, rWORD2
- b L(duLoop1)
- .align 4
-L(duP3x):
- addi rSTR1, rSTR1, 16
- addi rSTR2, rSTR2, 16
- cmpld cr5, rWORD7, rWORD8
- bne cr6, L(duLcr6)
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- b L(dutrim)
-
-/* Count is a multiple of 32, remainder is 0 */
- .align 4
-L(duP4):
- mtctr r0
- srd r0, rWORD8, rSHR
- LD rWORD1, 0, rSTR1
- sld rWORD2_SHIFT, rWORD8, rSHL
- or rWORD2, r0, rWORD6_SHIFT
-L(duP4e):
- LD rWORD3, rOFF8, rSTR1
- LD rWORD4, rOFF8, rSTR2
- cmpld cr7, rWORD1, rWORD2
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- or rWORD4, r12, rWORD2_SHIFT
- LD rWORD5, rOFF16, rSTR1
- LD rWORD6, rOFF16, rSTR2
- cmpld cr1, rWORD3, rWORD4
- bne cr7, L(duLcr7)
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- or rWORD6, r0, rWORD4_SHIFT
- LD rWORD7, rOFF24, rSTR1
- LD rWORD8, rOFF24, rSTR2
- addi rSTR1, rSTR1, 24
- addi rSTR2, rSTR2, 24
- cmpld cr6, rWORD5, rWORD6
- bne cr1, L(duLcr1)
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- cmpld cr5, rWORD7, rWORD8
- bdz L(du24) /* Adjust CTR as we start with +4 */
-/* This is the primary loop */
- .align 4
-L(duLoop):
- LD rWORD1, rOFF8, rSTR1
- LD rWORD2, rOFF8, rSTR2
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(duLcr6)
- srd r0, rWORD2, rSHR
- sld rWORD2_SHIFT, rWORD2, rSHL
- or rWORD2, r0, rWORD8_SHIFT
-L(duLoop1):
- LD rWORD3, rOFF16, rSTR1
- LD rWORD4, rOFF16, rSTR2
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(duLcr5)
- srd r12, rWORD4, rSHR
- sld rWORD4_SHIFT, rWORD4, rSHL
- or rWORD4, r12, rWORD2_SHIFT
-L(duLoop2):
- LD rWORD5, rOFF24, rSTR1
- LD rWORD6, rOFF24, rSTR2
- cmpld cr5, rWORD7, rWORD8
- bne cr7, L(duLcr7)
- srd r0, rWORD6, rSHR
- sld rWORD6_SHIFT, rWORD6, rSHL
- or rWORD6, r0, rWORD4_SHIFT
-L(duLoop3):
- LD rWORD7, rOFF32, rSTR1
- LD rWORD8, rOFF32, rSTR2
- addi rSTR1, rSTR1, 32
- addi rSTR2, rSTR2, 32
- cmpld cr7, rWORD1, rWORD2
- bne cr1, L(duLcr1)
- srd r12, rWORD8, rSHR
- sld rWORD8_SHIFT, rWORD8, rSHL
- or rWORD8, r12, rWORD6_SHIFT
- bdnz L(duLoop)
-
-L(duL4):
- cmpld cr1, rWORD3, rWORD4
- bne cr6, L(duLcr6)
- cmpld cr6, rWORD5, rWORD6
- bne cr5, L(duLcr5)
- cmpld cr5, rWORD7, rWORD8
-L(du44):
- bne cr7, L(duLcr7)
-L(du34):
- bne cr1, L(duLcr1)
-L(du24):
- bne cr6, L(duLcr6)
-L(du14):
- sldi. rN, rN, 3
- bne cr5, L(duLcr5)
-/* At this point we have a remainder of 1 to 7 bytes to compare. We use
- shift right double to eliminate bits beyond the compare length.
-
- However it may not be safe to load rWORD2 which may be beyond the
- string length. So we compare the bit length of the remainder to
- the right shift count (rSHR). If the bit count is less than or equal
- we do not need to load rWORD2 (all significant bits are already in
- rWORD8_SHIFT). */
- cmpld cr7, rN, rSHR
- beq L(duZeroReturn)
- li r0, 0
- ble cr7, L(dutrim)
- LD rWORD2, rOFF8, rSTR2
- srd r0, rWORD2, rSHR
- .align 4
-L(dutrim):
- LD rWORD1, rOFF8, rSTR1
- ld rWORD8, rWORD8SAVE(r1)
- subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
- or rWORD2, r0, rWORD8_SHIFT
- ld rWORD7, rWORD7SAVE(r1)
- ld rSHL, rSHLSAVE(r1)
- srd rWORD1, rWORD1, rN
- srd rWORD2, rWORD2, rN
- ld rSHR, rSHRSAVE(r1)
- ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- li rRTN, 0
- cmpld cr7, rWORD1, rWORD2
- ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
- ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
- beq cr7, L(dureturn24)
- li rRTN, 1
- ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- bgtlr cr7
- li rRTN, -1
- blr
- .align 4
-L(duLcr7):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr7, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
- .align 4
-L(duLcr1):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr1, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
- .align 4
-L(duLcr6):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr6, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
- .align 4
-L(duLcr5):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
- li rRTN, 1
- bgt cr5, L(dureturn29)
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
- li rRTN, -1
- b L(dureturn27)
-
- .align 3
-L(duZeroReturn):
- li rRTN, 0
- .align 4
-L(dureturn):
- ld rWORD8, rWORD8SAVE(r1)
- ld rWORD7, rWORD7SAVE(r1)
-L(dureturn29):
- ld rSHL, rSHLSAVE(r1)
- ld rSHR, rSHRSAVE(r1)
-L(dureturn27):
- ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
- ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
- ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
-L(dureturn24):
- ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- blr
-
-L(duzeroLength):
- ld rOFF8, rOFF8SAVE(r1)
- ld rOFF16, rOFF16SAVE(r1)
- ld rOFF24, rOFF24SAVE(r1)
- ld rOFF32, rOFF32SAVE(r1)
- li rRTN, 0
- blr
-
-END (MEMCMP)
-libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp, bcmp)
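
The core convention of the routine — load eight bytes so the lowest-addressed byte is most significant (ldx on big-endian, ldbrx on little-endian), then let one unsigned compare stand in for eight byte compares — can be sketched in C as follows (a reference model, not glibc code):

#include <stdint.h>
#include <stddef.h>

/* Compare in 8-byte blocks, then byte by byte.  Assembling the words
   big-endian-style reproduces what the LD macro (ldx/ldbrx) above
   guarantees, so the unsigned word compare orders like memcmp.  */
static int
memcmp_ref (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  for (; n >= 8; n -= 8, s1 += 8, s2 += 8)
    {
      uint64_t w1 = 0, w2 = 0;
      for (int i = 0; i < 8; i++)
        {
          w1 = (w1 << 8) | s1[i];
          w2 = (w2 << 8) | s2[i];
        }
      if (w1 != w2)                    /* li rRTN,1 / bgtlr / li rRTN,-1 */
        return w1 > w2 ? 1 : -1;
    }
  for (; n > 0; n--, s1++, s2++)
    if (*s1 != *s2)
      return *s1 > *s2 ? 1 : -1;
  return 0;
}
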
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
deleted file mode 100644
index e08993cbc3..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ /dev/null
@@ -1,430 +0,0 @@
-/* Optimized memcpy implementation for PowerPC64/POWER7.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
-/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
- Returns 'dst'. */
-
-#ifndef MEMCPY
-# define MEMCPY memcpy
-#endif
-
-#define dst 11 /* Use r11 so r3 kept unchanged. */
-#define src 4
-#define cnt 5
-
- .machine power7
-EALIGN (MEMCPY, 5, 0)
- CALL_MCOUNT 3
-
- cmpldi cr1,cnt,31
- neg 0,3
- ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
- code. */
-
-/* Align copies to quadword boundaries when using VSX instructions, to
- avoid alignment traps when memcpy is used on non-cacheable memory (for
- instance, memory-mapped I/O). */
- andi. 10,3,15
- clrldi 11,4,60
- cmpld cr6,10,11 /* SRC and DST alignments match? */
-
- mr dst,3
- bne cr6,L(copy_GE_32_unaligned)
- beq L(aligned_copy)
-
- mtocrf 0x01,0
- clrldi 0,0,60
-
-/* Get the DST and SRC aligned to 16 bytes. */
-1:
- bf 31,2f
- lbz 6,0(src)
- addi src,src,1
- stb 6,0(dst)
- addi dst,dst,1
-2:
- bf 30,4f
- lhz 6,0(src)
- addi src,src,2
- sth 6,0(dst)
- addi dst,dst,2
-4:
- bf 29,8f
- lwz 6,0(src)
- addi src,src,4
- stw 6,0(dst)
- addi dst,dst,4
-8:
- bf 28,16f
- ld 6,0(src)
- addi src,src,8
- std 6,0(dst)
- addi dst,dst,8
-16:
- subf cnt,0,cnt
-
-/* Main aligned copy loop. Copies 128 bytes at a time. */
-L(aligned_copy):
- li 6,16
- li 7,32
- li 8,48
- mtocrf 0x02,cnt
- srdi 12,cnt,7
- cmpdi 12,0
- beq L(aligned_tail)
- lxvd2x 6,0,src
- lxvd2x 7,src,6
- mtctr 12
- b L(aligned_128loop)
-
- .align 4
-L(aligned_128head):
- /* For the second and later iterations of this loop. */
- lxvd2x 6,0,src
- lxvd2x 7,src,6
-L(aligned_128loop):
- lxvd2x 8,src,7
- lxvd2x 9,src,8
- stxvd2x 6,0,dst
- addi src,src,64
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
- lxvd2x 6,0,src
- lxvd2x 7,src,6
- addi dst,dst,64
- lxvd2x 8,src,7
- lxvd2x 9,src,8
- addi src,src,64
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
- addi dst,dst,64
- bdnz L(aligned_128head)
-
-L(aligned_tail):
- mtocrf 0x01,cnt
- bf 25,32f
- lxvd2x 6,0,src
- lxvd2x 7,src,6
- lxvd2x 8,src,7
- lxvd2x 9,src,8
- addi src,src,64
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- stxvd2x 8,dst,7
- stxvd2x 9,dst,8
- addi dst,dst,64
-32:
- bf 26,16f
- lxvd2x 6,0,src
- lxvd2x 7,src,6
- addi src,src,32
- stxvd2x 6,0,dst
- stxvd2x 7,dst,6
- addi dst,dst,32
-16:
- bf 27,8f
- lxvd2x 6,0,src
- addi src,src,16
- stxvd2x 6,0,dst
- addi dst,dst,16
-8:
- bf 28,4f
- ld 6,0(src)
- addi src,src,8
- std 6,0(dst)
- addi dst,dst,8
-4: /* Copies 4~7 bytes. */
- bf 29,L(tail2)
- lwz 6,0(src)
- stw 6,0(dst)
- bf 30,L(tail5)
- lhz 7,4(src)
- sth 7,4(dst)
- bflr 31
- lbz 8,6(src)
- stb 8,6(dst)
- /* Return original DST pointer. */
- blr
-
-
-/* Handle copies of 0~31 bytes. */
- .align 4
-L(copy_LT_32):
- mr dst,3
- cmpldi cr6,cnt,8
- mtocrf 0x01,cnt
- ble cr6,L(copy_LE_8)
-
- /* At least 9 bytes to go. */
- neg 8,4
- andi. 0,8,3
- cmpldi cr1,cnt,16
- beq L(copy_LT_32_aligned)
-
- /* Force 4-byte alignment for SRC. */
- mtocrf 0x01,0
- subf cnt,0,cnt
-2:
- bf 30,1f
- lhz 6,0(src)
- addi src,src,2
- sth 6,0(dst)
- addi dst,dst,2
-1:
- bf 31,L(end_4bytes_alignment)
- lbz 6,0(src)
- addi src,src,1
- stb 6,0(dst)
- addi dst,dst,1
-
- .align 4
-L(end_4bytes_alignment):
- cmpldi cr1,cnt,16
- mtocrf 0x01,cnt
-
-L(copy_LT_32_aligned):
- /* At least 6 bytes to go, and SRC is word-aligned. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- lwz 6,0(src)
- lwz 7,4(src)
- stw 6,0(dst)
- lwz 8,8(src)
- stw 7,4(dst)
- lwz 6,12(src)
- addi src,src,16
- stw 8,8(dst)
- stw 6,12(dst)
- addi dst,dst,16
-8: /* Copy 8 bytes. */
- bf 28,L(tail4)
- lwz 6,0(src)
- lwz 7,4(src)
- addi src,src,8
- stw 6,0(dst)
- stw 7,4(dst)
- addi dst,dst,8
-
- .align 4
-/* Copies 4~7 bytes. */
-L(tail4):
- bf 29,L(tail2)
- lwz 6,0(src)
- stw 6,0(dst)
- bf 30,L(tail5)
- lhz 7,4(src)
- sth 7,4(dst)
- bflr 31
- lbz 8,6(src)
- stb 8,6(dst)
- /* Return original DST pointer. */
- blr
-
- .align 4
-/* Copies 2~3 bytes. */
-L(tail2):
- bf 30,1f
- lhz 6,0(src)
- sth 6,0(dst)
- bflr 31
- lbz 7,2(src)
- stb 7,2(dst)
- blr
-
- .align 4
-L(tail5):
- bflr 31
- lbz 6,4(src)
- stb 6,4(dst)
- blr
-
- .align 4
-1:
- bflr 31
- lbz 6,0(src)
- stb 6,0(dst)
- /* Return original DST pointer. */
- blr
-
-
-/* Handles copies of 0~8 bytes. */
- .align 4
-L(copy_LE_8):
- bne cr6,L(tail4)
-
- /* Though we could've used ld/std here, they are still
- slow for unaligned cases. */
-
- lwz 6,0(src)
- lwz 7,4(src)
- stw 6,0(dst)
- stw 7,4(dst)
- blr
-
-
-/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
- SRC is not. Use aligned quadword loads from SRC, shifted to realign
- the data, allowing for aligned DST stores. */
- .align 4
-L(copy_GE_32_unaligned):
- clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
- srdi 9,cnt,4 /* Number of full quadwords remaining. */
-
- beq L(copy_GE_32_unaligned_cont)
-
- /* DST is not quadword aligned, get it aligned. */
-
- mtocrf 0x01,0
- subf cnt,0,cnt
-
- /* Vector instructions work best when proper alignment (16-bytes)
- is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
-1:
- bf 31,2f
- lbz 6,0(src)
- addi src,src,1
- stb 6,0(dst)
- addi dst,dst,1
-2:
- bf 30,4f
- lhz 6,0(src)
- addi src,src,2
- sth 6,0(dst)
- addi dst,dst,2
-4:
- bf 29,8f
- lwz 6,0(src)
- addi src,src,4
- stw 6,0(dst)
- addi dst,dst,4
-8:
- bf 28,0f
- ld 6,0(src)
- addi src,src,8
- std 6,0(dst)
- addi dst,dst,8
-0:
- srdi 9,cnt,4 /* Number of full quadwords remaining. */
-
- /* The proper alignment is present, it is OK to copy the bytes now. */
-L(copy_GE_32_unaligned_cont):
-
- /* Setup two indexes to speed up the indexed vector operations. */
- clrldi 10,cnt,60
- li 6,16 /* Index for 16-bytes offsets. */
- li 7,32 /* Index for 32-bytes offsets. */
- cmpldi cr1,10,0
- srdi 8,cnt,5 /* Setup the loop counter. */
- mtocrf 0x01,9
- cmpldi cr6,9,1
-#ifdef __LITTLE_ENDIAN__
- lvsr 5,0,src
-#else
- lvsl 5,0,src
-#endif
- lvx 3,0,src
- li 0,0
- bf 31,L(setup_unaligned_loop)
-
- /* Copy another 16 bytes to align to 32-bytes due to the loop. */
- lvx 4,src,6
-#ifdef __LITTLE_ENDIAN__
- vperm 6,4,3,5
-#else
- vperm 6,3,4,5
-#endif
- addi src,src,16
- stvx 6,0,dst
- addi dst,dst,16
- vor 3,4,4
- clrrdi 0,src,60
-
-L(setup_unaligned_loop):
- mtctr 8
- ble cr6,L(end_unaligned_loop)
-
- /* Copy 32 bytes at a time using vector instructions. */
- .align 4
-L(unaligned_loop):
-
- /* Note: vr6/vr10 may contain data that was already copied,
- but in order to get proper alignment, we may have to copy
- some portions again. This is faster than having unaligned
- vector instructions though. */
-
- lvx 4,src,6
-#ifdef __LITTLE_ENDIAN__
- vperm 6,4,3,5
-#else
- vperm 6,3,4,5
-#endif
- lvx 3,src,7
-#ifdef __LITTLE_ENDIAN__
- vperm 10,3,4,5
-#else
- vperm 10,4,3,5
-#endif
- addi src,src,32
- stvx 6,0,dst
- stvx 10,dst,6
- addi dst,dst,32
- bdnz L(unaligned_loop)
-
- clrrdi 0,src,60
-
- .align 4
-L(end_unaligned_loop):
-
- /* Check for tail bytes. */
- mtocrf 0x01,cnt
- beqlr cr1
-
- add src,src,0
-
- /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
- /* Copy 8 bytes. */
- bf 28,4f
- lwz 6,0(src)
- lwz 7,4(src)
- addi src,src,8
- stw 6,0(dst)
- stw 7,4(dst)
- addi dst,dst,8
-4: /* Copy 4~7 bytes. */
- bf 29,L(tail2)
- lwz 6,0(src)
- stw 6,0(dst)
- bf 30,L(tail5)
- lhz 7,4(src)
- sth 7,4(dst)
- bflr 31
- lbz 8,6(src)
- stb 8,6(dst)
- /* Return original DST pointer. */
- blr
-
-END_GEN_TB (MEMCPY,TB_TOCLESS)
-libc_hidden_builtin_def (memcpy)
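
The L(copy_GE_32_unaligned) path above never issues an unaligned vector
load: it reads aligned quadwords and lets vperm, driven by the control
vector from lvsl/lvsr, shift the bytes into place before each aligned
store.  A scalar C sketch of the same aligned-load-and-merge idea, on
8-byte words instead of quadwords (a sketch only: load64_merged is a
hypothetical helper, little-endian byte order is assumed, and the
pointer casts gloss over strict aliasing):

    #include <stdint.h>

    /* Synthesize one unaligned 8-byte load from the two aligned words
       that contain it.  Reading the full containing words must be
       safe, as it is for the aligned quadword reads in the assembly.  */
    static uint64_t
    load64_merged (const unsigned char *p)
    {
      unsigned misal = (uintptr_t) p & 7;
      const uint64_t *base = (const uint64_t *) (p - misal);
      if (misal == 0)
        return base[0];
      uint64_t lo = base[0], hi = base[1];
      /* Shift the low word down and the high word up, then merge;
         vperm does the 16-byte version of this in one instruction.  */
      return (lo >> (8 * misal)) | (hi << (8 * (8 - misal)));
    }

The vor 3,4,4 in the loop keeps the previous quadword live across
iterations, so each 16 bytes of output costs only one new load.
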
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
deleted file mode 100644
index 4c0f7c3571..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ /dev/null
@@ -1,835 +0,0 @@
-/* Optimized memmove implementation for PowerPC64/POWER7.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
-/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
-
-   This optimization checks if memory 'dest' overlaps with 'src'.  If it
-   does not, it calls an optimized memcpy (similar to memcpy for POWER7,
-   embedded here to gain some cycles).
-   If source and destination overlap, an optimized backwards memcpy is
-   used instead.  */
-
-#ifndef MEMMOVE
-# define MEMMOVE memmove
-#endif
- .machine power7
-EALIGN (MEMMOVE, 5, 0)
- CALL_MCOUNT 3
-
-L(_memmove):
- subf r9,r4,r3
- cmpld cr7,r9,r5
- blt cr7,L(memmove_bwd)
-
- cmpldi cr1,r5,31
- neg 0,3
- ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
- code. */
-
- andi. 10,3,15
- clrldi 11,4,60
- cmpld cr6,10,11 /* SRC and DST alignments match? */
-
- mr r11,3
- bne cr6,L(copy_GE_32_unaligned)
- beq L(aligned_copy)
-
- mtocrf 0x01,0
- clrldi 0,0,60
-
-/* Get the DST and SRC aligned to 16 bytes.  */
-1:
- bf 31,2f
- lbz 6,0(r4)
- addi r4,r4,1
- stb 6,0(r11)
- addi r11,r11,1
-2:
- bf 30,4f
- lhz 6,0(r4)
- addi r4,r4,2
- sth 6,0(r11)
- addi r11,r11,2
-4:
- bf 29,8f
- lwz 6,0(r4)
- addi r4,r4,4
- stw 6,0(r11)
- addi r11,r11,4
-8:
- bf 28,16f
- ld 6,0(r4)
- addi r4,r4,8
- std 6,0(r11)
- addi r11,r11,8
-16:
- subf r5,0,r5
-
-/* Main aligned copy loop. Copies 128 bytes at a time. */
-L(aligned_copy):
- li 6,16
- li 7,32
- li 8,48
- mtocrf 0x02,r5
- srdi 12,r5,7
- cmpdi 12,0
- beq L(aligned_tail)
- lxvd2x 6,0,r4
- lxvd2x 7,r4,6
- mtctr 12
- b L(aligned_128loop)
-
- .align 4
-L(aligned_128head):
-	/* For the 2nd and later iterations of this loop.  */
- lxvd2x 6,0,r4
- lxvd2x 7,r4,6
-L(aligned_128loop):
- lxvd2x 8,r4,7
- lxvd2x 9,r4,8
- stxvd2x 6,0,r11
- addi r4,r4,64
- stxvd2x 7,r11,6
- stxvd2x 8,r11,7
- stxvd2x 9,r11,8
- lxvd2x 6,0,r4
- lxvd2x 7,r4,6
- addi r11,r11,64
- lxvd2x 8,r4,7
- lxvd2x 9,r4,8
- addi r4,r4,64
- stxvd2x 6,0,r11
- stxvd2x 7,r11,6
- stxvd2x 8,r11,7
- stxvd2x 9,r11,8
- addi r11,r11,64
- bdnz L(aligned_128head)
-
-L(aligned_tail):
- mtocrf 0x01,r5
- bf 25,32f
- lxvd2x 6,0,r4
- lxvd2x 7,r4,6
- lxvd2x 8,r4,7
- lxvd2x 9,r4,8
- addi r4,r4,64
- stxvd2x 6,0,r11
- stxvd2x 7,r11,6
- stxvd2x 8,r11,7
- stxvd2x 9,r11,8
- addi r11,r11,64
-32:
- bf 26,16f
- lxvd2x 6,0,r4
- lxvd2x 7,r4,6
- addi r4,r4,32
- stxvd2x 6,0,r11
- stxvd2x 7,r11,6
- addi r11,r11,32
-16:
- bf 27,8f
- lxvd2x 6,0,r4
- addi r4,r4,16
- stxvd2x 6,0,r11
- addi r11,r11,16
-8:
- bf 28,4f
- ld 6,0(r4)
- addi r4,r4,8
- std 6,0(r11)
- addi r11,r11,8
-4: /* Copies 4~7 bytes. */
- bf 29,L(tail2)
- lwz 6,0(r4)
- stw 6,0(r11)
- bf 30,L(tail5)
- lhz 7,4(r4)
- sth 7,4(r11)
- bflr 31
- lbz 8,6(r4)
- stb 8,6(r11)
- /* Return original DST pointer. */
- blr
-
-/* Handle copies of 0~31 bytes. */
- .align 4
-L(copy_LT_32):
- mr r11,3
- cmpldi cr6,r5,8
- mtocrf 0x01,r5
- ble cr6,L(copy_LE_8)
-
- /* At least 9 bytes to go. */
- neg 8,4
- andi. 0,8,3
- cmpldi cr1,r5,16
- beq L(copy_LT_32_aligned)
-
- /* Force 4-byte alignment for SRC. */
- mtocrf 0x01,0
- subf r5,0,r5
-2:
- bf 30,1f
- lhz 6,0(r4)
- addi r4,r4,2
- sth 6,0(r11)
- addi r11,r11,2
-1:
- bf 31,L(end_4bytes_alignment)
- lbz 6,0(r4)
- addi r4,r4,1
- stb 6,0(r11)
- addi r11,r11,1
-
- .align 4
-L(end_4bytes_alignment):
- cmpldi cr1,r5,16
- mtocrf 0x01,r5
-
-L(copy_LT_32_aligned):
- /* At least 6 bytes to go, and SRC is word-aligned. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- lwz 6,0(r4)
- lwz 7,4(r4)
- stw 6,0(r11)
- lwz 8,8(r4)
- stw 7,4(r11)
- lwz 6,12(r4)
- addi r4,r4,16
- stw 8,8(r11)
- stw 6,12(r11)
- addi r11,r11,16
-8: /* Copy 8 bytes. */
- bf 28,L(tail4)
- lwz 6,0(r4)
- lwz 7,4(r4)
- addi r4,r4,8
- stw 6,0(r11)
- stw 7,4(r11)
- addi r11,r11,8
-
- .align 4
-/* Copies 4~7 bytes. */
-L(tail4):
- bf 29,L(tail2)
- lwz 6,0(r4)
- stw 6,0(r11)
- bf 30,L(tail5)
- lhz 7,4(r4)
- sth 7,4(r11)
- bflr 31
- lbz 8,6(r4)
- stb 8,6(r11)
- /* Return original DST pointer. */
- blr
-
- .align 4
-/* Copies 2~3 bytes. */
-L(tail2):
- bf 30,1f
- lhz 6,0(r4)
- sth 6,0(r11)
- bflr 31
- lbz 7,2(r4)
- stb 7,2(r11)
- blr
-
- .align 4
-L(tail5):
- bflr 31
- lbz 6,4(r4)
- stb 6,4(r11)
- blr
-
- .align 4
-1:
- bflr 31
- lbz 6,0(r4)
- stb 6,0(r11)
- /* Return original DST pointer. */
- blr
-
-/* Handles copies of 0~8 bytes. */
- .align 4
-L(copy_LE_8):
- bne cr6,L(tail4)
-
- /* Though we could've used ld/std here, they are still
- slow for unaligned cases. */
-
- lwz 6,0(r4)
- lwz 7,4(r4)
- stw 6,0(r11)
- stw 7,4(r11)
- blr
-
-
-/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
- SRC is not. Use aligned quadword loads from SRC, shifted to realign
- the data, allowing for aligned DST stores. */
- .align 4
-L(copy_GE_32_unaligned):
- clrldi 0,0,60 /* Number of bytes until the 1st r11 quadword. */
- srdi 9,r5,4 /* Number of full quadwords remaining. */
-
- beq L(copy_GE_32_unaligned_cont)
-
- /* DST is not quadword aligned, get it aligned. */
-
- mtocrf 0x01,0
- subf r5,0,r5
-
- /* Vector instructions work best when proper alignment (16-bytes)
- is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
-1:
- bf 31,2f
- lbz 6,0(r4)
- addi r4,r4,1
- stb 6,0(r11)
- addi r11,r11,1
-2:
- bf 30,4f
- lhz 6,0(r4)
- addi r4,r4,2
- sth 6,0(r11)
- addi r11,r11,2
-4:
- bf 29,8f
- lwz 6,0(r4)
- addi r4,r4,4
- stw 6,0(r11)
- addi r11,r11,4
-8:
- bf 28,0f
- ld 6,0(r4)
- addi r4,r4,8
- std 6,0(r11)
- addi r11,r11,8
-0:
- srdi 9,r5,4 /* Number of full quadwords remaining. */
-
- /* The proper alignment is present, it is OK to copy the bytes now. */
-L(copy_GE_32_unaligned_cont):
-
- /* Setup two indexes to speed up the indexed vector operations. */
- clrldi 10,r5,60
- li 6,16 /* Index for 16-bytes offsets. */
- li 7,32 /* Index for 32-bytes offsets. */
- cmpldi cr1,10,0
- srdi 8,r5,5 /* Setup the loop counter. */
- mtocrf 0x01,9
- cmpldi cr6,9,1
-#ifdef __LITTLE_ENDIAN__
- lvsr 5,0,r4
-#else
- lvsl 5,0,r4
-#endif
- lvx 3,0,r4
- li 0,0
- bf 31,L(setup_unaligned_loop)
-
- /* Copy another 16 bytes to align to 32-bytes due to the loop. */
- lvx 4,r4,6
-#ifdef __LITTLE_ENDIAN__
- vperm 6,4,3,5
-#else
- vperm 6,3,4,5
-#endif
- addi r4,r4,16
- stvx 6,0,r11
- addi r11,r11,16
- vor 3,4,4
- clrrdi 0,r4,60
-
-L(setup_unaligned_loop):
- mtctr 8
- ble cr6,L(end_unaligned_loop)
-
- /* Copy 32 bytes at a time using vector instructions. */
- .align 4
-L(unaligned_loop):
-
- /* Note: vr6/vr10 may contain data that was already copied,
- but in order to get proper alignment, we may have to copy
- some portions again. This is faster than having unaligned
- vector instructions though. */
-
- lvx 4,r4,6
-#ifdef __LITTLE_ENDIAN__
- vperm 6,4,3,5
-#else
- vperm 6,3,4,5
-#endif
- lvx 3,r4,7
-#ifdef __LITTLE_ENDIAN__
- vperm 10,3,4,5
-#else
- vperm 10,4,3,5
-#endif
- addi r4,r4,32
- stvx 6,0,r11
- stvx 10,r11,6
- addi r11,r11,32
- bdnz L(unaligned_loop)
-
- clrrdi 0,r4,60
-
- .align 4
-L(end_unaligned_loop):
-
- /* Check for tail bytes. */
- mtocrf 0x01,r5
- beqlr cr1
-
- add r4,r4,0
-
- /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
- /* Copy 8 bytes. */
- bf 28,4f
- lwz 6,0(r4)
- lwz 7,4(r4)
- addi r4,r4,8
- stw 6,0(r11)
- stw 7,4(r11)
- addi r11,r11,8
-4: /* Copy 4~7 bytes. */
- bf 29,L(tail2)
- lwz 6,0(r4)
- stw 6,0(r11)
- bf 30,L(tail5)
- lhz 7,4(r4)
- sth 7,4(r11)
- bflr 31
- lbz 8,6(r4)
- stb 8,6(r11)
- /* Return original DST pointer. */
- blr
-
-	/* Start of the backward memcpy implementation: the algorithm first
-	   checks if src and dest have the same alignment and, if they do,
-	   aligns both to 16 bytes and copies using VSX instructions.
-	   If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
-	   instructions to read two 16-byte blocks at a time, shifting and
-	   permuting the bytes read, and writing them aligned to dest.  */
-L(memmove_bwd):
- cmpldi cr1,r5,31
- /* Copy is done backwards: update the pointers and check alignment. */
- add r11,r3,r5
- add r4,r4,r5
- mr r0,r11
- ble cr1, L(copy_LT_32_bwd) /* If move < 32 bytes use short move
- code. */
-
- andi. r10,r11,15 /* Check if r11 is aligned to 16 bytes */
- clrldi r9,r4,60 /* Check if r4 is aligned to 16 bytes */
- cmpld cr6,r10,r9 /* SRC and DST alignments match? */
-
- bne cr6,L(copy_GE_32_unaligned_bwd)
- beq L(aligned_copy_bwd)
-
- mtocrf 0x01,r0
- clrldi r0,r0,60
-
-/* Get the DST and SRC aligned to 16 bytes. */
-1:
- bf 31,2f
- lbz r6,-1(r4)
- subi r4,r4,1
- stb r6,-1(r11)
- subi r11,r11,1
-2:
- bf 30,4f
- lhz r6,-2(r4)
- subi r4,r4,2
- sth r6,-2(r11)
- subi r11,r11,2
-4:
- bf 29,8f
- lwz r6,-4(r4)
- subi r4,r4,4
- stw r6,-4(r11)
- subi r11,r11,4
-8:
- bf 28,16f
- ld r6,-8(r4)
- subi r4,r4,8
- std r6,-8(r11)
- subi r11,r11,8
-16:
- subf r5,0,r5
-
-/* Main aligned copy loop. Copies 128 bytes at a time. */
-L(aligned_copy_bwd):
- li r6,-16
- li r7,-32
- li r8,-48
- li r9,-64
- mtocrf 0x02,r5
- srdi r12,r5,7
- cmpdi r12,0
- beq L(aligned_tail_bwd)
- lxvd2x v6,r4,r6
- lxvd2x v7,r4,r7
- mtctr 12
- b L(aligned_128loop_bwd)
-
- .align 4
-L(aligned_128head_bwd):
-	/* For the 2nd and later iterations of this loop.  */
- lxvd2x v6,r4,r6
- lxvd2x v7,r4,r7
-L(aligned_128loop_bwd):
- lxvd2x v8,r4,r8
- lxvd2x v9,r4,r9
- stxvd2x v6,r11,r6
- subi r4,r4,64
- stxvd2x v7,r11,r7
- stxvd2x v8,r11,r8
- stxvd2x v9,r11,r9
- lxvd2x v6,r4,r6
-	lxvd2x	v7,r4,r7
- subi r11,r11,64
- lxvd2x v8,r4,r8
- lxvd2x v9,r4,r9
- subi r4,r4,64
- stxvd2x v6,r11,r6
- stxvd2x v7,r11,r7
- stxvd2x v8,r11,r8
- stxvd2x v9,r11,r9
- subi r11,r11,64
- bdnz L(aligned_128head_bwd)
-
-L(aligned_tail_bwd):
- mtocrf 0x01,r5
- bf 25,32f
- lxvd2x v6,r4,r6
- lxvd2x v7,r4,r7
- lxvd2x v8,r4,r8
- lxvd2x v9,r4,r9
- subi r4,r4,64
- stxvd2x v6,r11,r6
- stxvd2x v7,r11,r7
- stxvd2x v8,r11,r8
- stxvd2x v9,r11,r9
- subi r11,r11,64
-32:
- bf 26,16f
- lxvd2x v6,r4,r6
- lxvd2x v7,r4,r7
- subi r4,r4,32
- stxvd2x v6,r11,r6
- stxvd2x v7,r11,r7
- subi r11,r11,32
-16:
- bf 27,8f
- lxvd2x v6,r4,r6
- subi r4,r4,16
- stxvd2x v6,r11,r6
- subi r11,r11,16
-8:
- bf 28,4f
- ld r6,-8(r4)
- subi r4,r4,8
- std r6,-8(r11)
- subi r11,r11,8
-4: /* Copies 4~7 bytes. */
- bf 29,L(tail2_bwd)
- lwz r6,-4(r4)
- stw r6,-4(r11)
- bf 30,L(tail5_bwd)
- lhz r7,-6(r4)
- sth r7,-6(r11)
- bflr 31
- lbz r8,-7(r4)
- stb r8,-7(r11)
- /* Return original DST pointer. */
- blr
-
-/* Handle copies of 0~31 bytes. */
- .align 4
-L(copy_LT_32_bwd):
- cmpldi cr6,r5,8
- mtocrf 0x01,r5
- ble cr6,L(copy_LE_8_bwd)
-
- /* At least 9 bytes to go. */
- neg r8,r4
- andi. r0,r8,3
- cmpldi cr1,r5,16
- beq L(copy_LT_32_aligned_bwd)
-
- /* Force 4-byte alignment for SRC. */
- mtocrf 0x01,0
- subf r5,0,r5
-2:
- bf 30,1f
- lhz r6,-2(r4)
- subi r4,r4,2
- sth r6,-2(r11)
- subi r11,r11,2
-1:
- bf 31,L(end_4bytes_alignment_bwd)
- lbz 6,-1(r4)
- subi r4,r4,1
- stb 6,-1(r11)
- subi r11,r11,1
-
- .align 4
-L(end_4bytes_alignment_bwd):
- cmpldi cr1,r5,16
- mtocrf 0x01,r5
-
-L(copy_LT_32_aligned_bwd):
- /* At least 6 bytes to go, and SRC is word-aligned. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- lwz r6,-4(r4)
- lwz r7,-8(r4)
- stw r6,-4(r11)
- lwz r8,-12(r4)
- stw r7,-8(r11)
- lwz r6,-16(r4)
- subi r4,r4,16
- stw r8,-12(r11)
- stw r6,-16(r11)
- subi r11,r11,16
-8: /* Copy 8 bytes. */
- bf 28,L(tail4_bwd)
- lwz r6,-4(r4)
- lwz r7,-8(r4)
- subi r4,r4,8
- stw r6,-4(r11)
- stw r7,-8(r11)
- subi r11,r11,8
-
- .align 4
-/* Copies 4~7 bytes. */
-L(tail4_bwd):
- bf 29,L(tail2_bwd)
- lwz 6,-4(r4)
- stw 6,-4(r11)
- bf 30,L(tail5_bwd)
- lhz 7,-6(r4)
- sth 7,-6(r11)
- bflr 31
- lbz 8,-7(r4)
- stb 8,-7(r11)
- /* Return original DST pointer. */
- blr
-
- .align 4
-/* Copies 2~3 bytes. */
-L(tail2_bwd):
- bf 30,1f
- lhz 6,-2(r4)
- sth 6,-2(r11)
- bflr 31
- lbz 7,-3(r4)
- stb 7,-3(r11)
- blr
-
- .align 4
-L(tail5_bwd):
- bflr 31
- lbz 6,-5(r4)
- stb 6,-5(r11)
- blr
-
- .align 4
-1:
- bflr 31
- lbz 6,-1(r4)
- stb 6,-1(r11)
- /* Return original DST pointer. */
- blr
-
-
-/* Handles copies of 0~8 bytes. */
- .align 4
-L(copy_LE_8_bwd):
- bne cr6,L(tail4_bwd)
-
- /* Though we could've used ld/std here, they are still
- slow for unaligned cases. */
- lwz 6,-8(r4)
- lwz 7,-4(r4)
- stw 6,-8(r11)
- stw 7,-4(r11)
- blr
-
-
-/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
- SRC is not. Use aligned quadword loads from SRC, shifted to realign
- the data, allowing for aligned DST stores. */
- .align 4
-L(copy_GE_32_unaligned_bwd):
-	andi.	r10,r11,15	/* Check alignment of DST against 16 bytes.  */
- srdi r9,r5,4 /* Number of full quadwords remaining. */
-
- beq L(copy_GE_32_unaligned_cont_bwd)
-
- /* DST is not quadword aligned and r10 holds the address masked to
- compare alignments. */
- mtocrf 0x01,r10
- subf r5,r10,r5
-
- /* Vector instructions work best when proper alignment (16-bytes)
- is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
-1:
- bf 31,2f
- lbz r6,-1(r4)
- subi r4,r4,1
- stb r6,-1(r11)
- subi r11,r11,1
-2:
- bf 30,4f
- lhz r6,-2(r4)
- subi r4,r4,2
- sth r6,-2(r11)
- subi r11,r11,2
-4:
- bf 29,8f
- lwz r6,-4(r4)
- subi r4,r4,4
- stw r6,-4(r11)
- subi r11,r11,4
-8:
- bf 28,0f
- ld r6,-8(r4)
- subi r4,r4,8
- std r6,-8(r11)
- subi r11,r11,8
-0:
- srdi r9,r5,4 /* Number of full quadwords remaining. */
-
- /* The proper alignment is present, it is OK to copy the bytes now. */
-L(copy_GE_32_unaligned_cont_bwd):
-
- /* Setup two indexes to speed up the indexed vector operations. */
- clrldi r10,r5,60
- li r6,-16 /* Index for 16-bytes offsets. */
- li r7,-32 /* Index for 32-bytes offsets. */
- cmpldi cr1,10,0
- srdi r8,r5,5 /* Setup the loop counter. */
- mtocrf 0x01,9
- cmpldi cr6,r9,1
-#ifdef __LITTLE_ENDIAN__
- lvsr v5,r0,r4
-#else
- lvsl v5,r0,r4
-#endif
- lvx v3,0,r4
- li r0,0
- bf 31,L(setup_unaligned_loop_bwd)
-
- /* Copy another 16 bytes to align to 32-bytes due to the loop. */
- lvx v4,r4,r6
-#ifdef __LITTLE_ENDIAN__
- vperm v6,v3,v4,v5
-#else
- vperm v6,v4,v3,v5
-#endif
- subi r4,r4,16
- stvx v6,r11,r6
- subi r11,r11,16
- vor v3,v4,v4
- clrrdi r0,r4,60
-
-L(setup_unaligned_loop_bwd):
- mtctr r8
- ble cr6,L(end_unaligned_loop_bwd)
-
- /* Copy 32 bytes at a time using vector instructions. */
- .align 4
-L(unaligned_loop_bwd):
-
- /* Note: vr6/vr10 may contain data that was already copied,
- but in order to get proper alignment, we may have to copy
- some portions again. This is faster than having unaligned
- vector instructions though. */
-
- lvx v4,r4,r6
-#ifdef __LITTLE_ENDIAN__
- vperm v6,v3,v4,v5
-#else
- vperm v6,v4,v3,v5
-#endif
- lvx v3,r4,r7
-#ifdef __LITTLE_ENDIAN__
- vperm v10,v4,v3,v5
-#else
- vperm v10,v3,v4,v5
-#endif
- subi r4,r4,32
- stvx v6,r11,r6
- stvx v10,r11,r7
- subi r11,r11,32
- bdnz L(unaligned_loop_bwd)
-
- clrrdi r0,r4,60
-
- .align 4
-L(end_unaligned_loop_bwd):
-
- /* Check for tail bytes. */
- mtocrf 0x01,r5
- beqlr cr1
-
- add r4,r4,0
-
- /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
- /* Copy 8 bytes. */
- bf 28,4f
- lwz r6,-4(r4)
- lwz r7,-8(r4)
- subi r4,r4,8
- stw r6,-4(r11)
- stw r7,-8(r11)
- subi r11,r11,8
-4: /* Copy 4~7 bytes. */
- bf 29,L(tail2_bwd)
- lwz r6,-4(r4)
- stw r6,-4(r11)
- bf 30,L(tail5_bwd)
- lhz r7,-6(r4)
- sth r7,-6(r11)
- bflr 31
- lbz r8,-7(r4)
- stb r8,-7(r11)
- /* Return original DST pointer. */
- blr
-END_GEN_TB (MEMMOVE, TB_TOCLESS)
-libc_hidden_builtin_def (memmove)
-
-
-/* void bcopy (const void *src [r3], void *dest [r4], size_t n [r5])
-   Implemented in this file to keep the linker from creating a stub for
-   the branch to '_memmove'.  */
-ENTRY (__bcopy)
- mr r6,r3
- mr r3,r4
- mr r4,r6
- b L(_memmove)
-END (__bcopy)
-weak_alias (__bcopy, bcopy)
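
The dispatch at the top of L(_memmove) reduces overlap detection to one
unsigned comparison: dest - src, computed modulo 2^64, is below len
exactly when dest starts inside the source region, the only case where
a forward copy would clobber bytes not yet read.  A byte-at-a-time C
sketch of that rule (illustration only; the real code copies in the
large aligned blocks shown above):

    #include <stddef.h>
    #include <stdint.h>

    void *
    memmove_sketch (void *dest, const void *src, size_t len)
    {
      unsigned char *d = dest;
      const unsigned char *s = src;
      if ((uintptr_t) d - (uintptr_t) s >= len)
        for (size_t i = 0; i < len; i++)    /* forward copy is safe */
          d[i] = s[i];
      else
        for (size_t i = len; i > 0; i--)    /* overlap: copy backward */
          d[i - 1] = s[i - 1];
      return dest;
    }

__bcopy then only needs to swap its first two arguments and branch to
the same local entry, which is why it is defined in this file.
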
diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
deleted file mode 100644
index 4e15d1e40c..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ /dev/null
@@ -1,472 +0,0 @@
-/* Optimized mempcpy implementation for POWER7.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
-/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
- Returns 'dst' + 'len'. */
-
-#ifndef MEMPCPY
-# define MEMPCPY __mempcpy
-#endif
- .machine power7
-EALIGN (MEMPCPY, 5, 0)
- CALL_MCOUNT 3
-
- cmpldi cr1,5,31
- neg 0,3
- std 3,-16(1)
- std 31,-8(1)
- cfi_offset(31,-8)
- ble cr1,L(copy_LT_32) /* If move < 32 bytes use short move
- code. */
-
- andi. 11,3,7 /* Check alignment of DST. */
-
-
- clrldi 10,4,61 /* Check alignment of SRC. */
- cmpld cr6,10,11 /* SRC and DST alignments match? */
- mr 12,4
- mr 31,5
- bne cr6,L(copy_GE_32_unaligned)
-
-	srdi	9,5,3	/* Number of full doublewords remaining.  */
-
- beq L(copy_GE_32_aligned_cont)
-
- clrldi 0,0,61
- mtcrf 0x01,0
- subf 31,0,5
-
- /* Get the SRC aligned to 8 bytes. */
-
-1: bf 31,2f
- lbz 6,0(12)
- addi 12,12,1
- stb 6,0(3)
- addi 3,3,1
-2: bf 30,4f
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-4: bf 29,0f
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-0:
- clrldi 10,12,61 /* Check alignment of SRC again. */
- srdi 9,31,3 /* Number of full doublewords remaining. */
-
-L(copy_GE_32_aligned_cont):
-
- clrldi 11,31,61
- mtcrf 0x01,9
-
- srdi 8,31,5
- cmpldi cr1,9,4
- cmpldi cr6,11,0
- mr 11,12
-
- /* Copy 1~3 doublewords so the main loop starts
- at a multiple of 32 bytes. */
-
- bf 30,1f
- ld 6,0(12)
- ld 7,8(12)
- addi 11,12,16
- mtctr 8
- std 6,0(3)
- std 7,8(3)
- addi 10,3,16
- bf 31,4f
- ld 0,16(12)
- std 0,16(3)
- blt cr1,3f
- addi 11,12,24
- addi 10,3,24
- b 4f
-
- .align 4
-1: /* Copy 1 doubleword and set the counter. */
- mr 10,3
- mtctr 8
- bf 31,4f
- ld 6,0(12)
- addi 11,12,8
- std 6,0(3)
- addi 10,3,8
-
- /* Main aligned copy loop. Copies 32-bytes at a time. */
- .align 4
-4:
- ld 6,0(11)
- ld 7,8(11)
- ld 8,16(11)
- ld 0,24(11)
- addi 11,11,32
-
- std 6,0(10)
- std 7,8(10)
- std 8,16(10)
- std 0,24(10)
- addi 10,10,32
- bdnz 4b
-3:
-
- /* Check for tail bytes. */
- rldicr 0,31,0,60
- mtcrf 0x01,31
- beq cr6,0f
-
-.L9:
- add 3,3,0
- add 12,12,0
-
- /* At this point we have a tail of 0-7 bytes and we know that the
- destination is doubleword-aligned. */
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-2: /* Copy 2 bytes. */
- bf 30,1f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-1: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,0(12)
- stb 6,0(3)
-0: /* Return DST + LEN pointer. */
- ld 31,-8(1)
- ld 3,-16(1)
- add 3,3,5
- blr
-
- /* Handle copies of 0~31 bytes. */
- .align 4
-L(copy_LT_32):
- cmpldi cr6,5,8
- mr 12,4
- mtcrf 0x01,5
- ble cr6,L(copy_LE_8)
-
- /* At least 9 bytes to go. */
- neg 8,4
- clrrdi 11,4,2
- andi. 0,8,3
- cmpldi cr1,5,16
- mr 10,5
- beq L(copy_LT_32_aligned)
-
- /* Force 4-bytes alignment for SRC. */
- mtocrf 0x01,0
- subf 10,0,5
-2: bf 30,1f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-1: bf 31,L(end_4bytes_alignment)
-
- lbz 6,0(12)
- addi 12,12,1
- stb 6,0(3)
- addi 3,3,1
-
- .align 4
-L(end_4bytes_alignment):
- cmpldi cr1,10,16
- mtcrf 0x01,10
-
-L(copy_LT_32_aligned):
- /* At least 6 bytes to go, and SRC is word-aligned. */
- blt cr1,8f
-
- /* Copy 16 bytes. */
- lwz 6,0(12)
- lwz 7,4(12)
- stw 6,0(3)
- lwz 8,8(12)
- stw 7,4(3)
- lwz 6,12(12)
- addi 12,12,16
- stw 8,8(3)
- stw 6,12(3)
- addi 3,3,16
-8: /* Copy 8 bytes. */
- bf 28,4f
-
- lwz 6,0(12)
- lwz 7,4(12)
- addi 12,12,8
- stw 6,0(3)
- stw 7,4(3)
- addi 3,3,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-2: /* Copy 2-3 bytes. */
- bf 30,1f
-
- lhz 6,0(12)
- sth 6,0(3)
- bf 31,0f
- lbz 7,2(12)
- stb 7,2(3)
- ld 3,-16(1)
- add 3,3,5
- blr
-
- .align 4
-1: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,0(12)
- stb 6,0(3)
-0: /* Return DST + LEN pointer. */
- ld 3,-16(1)
- add 3,3,5
- blr
-
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(copy_LE_8):
- bne cr6,4f
-
- /* Though we could've used ld/std here, they are still
- slow for unaligned cases. */
-
- lwz 6,0(4)
- lwz 7,4(4)
- stw 6,0(3)
- stw 7,4(3)
- ld 3,-16(1) /* Return DST + LEN pointer. */
- add 3,3,5
- blr
-
- .align 4
-4: /* Copies 4~7 bytes. */
- bf 29,2b
-
- lwz 6,0(4)
- stw 6,0(3)
- bf 30,5f
- lhz 7,4(4)
- sth 7,4(3)
- bf 31,0f
- lbz 8,6(4)
- stb 8,6(3)
- ld 3,-16(1)
- add 3,3,5
- blr
-
- .align 4
-5: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,4(4)
- stb 6,4(3)
-
-0: /* Return DST + LEN pointer. */
- ld 3,-16(1)
- add 3,3,5
- blr
-
- /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
- SRC is not. Use aligned quadword loads from SRC, shifted to realign
- the data, allowing for aligned DST stores. */
- .align 4
-L(copy_GE_32_unaligned):
- clrldi 0,0,60 /* Number of bytes until the 1st
- quadword. */
- andi. 11,3,15 /* Check alignment of DST (against
- quadwords). */
- srdi 9,5,4 /* Number of full quadwords remaining. */
-
- beq L(copy_GE_32_unaligned_cont)
-
-	/* DST is not quadword aligned, get it aligned.  */
-
- mtcrf 0x01,0
- subf 31,0,5
-
- /* Vector instructions work best when proper alignment (16-bytes)
- is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
-1: /* Copy 1 byte. */
- bf 31,2f
-
- lbz 6,0(12)
- addi 12,12,1
- stb 6,0(3)
- addi 3,3,1
-2: /* Copy 2 bytes. */
- bf 30,4f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-4: /* Copy 4 bytes. */
- bf 29,8f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-8: /* Copy 8 bytes. */
- bf 28,0f
-
- ld 6,0(12)
- addi 12,12,8
- std 6,0(3)
- addi 3,3,8
-0:
- clrldi 10,12,60 /* Check alignment of SRC. */
- srdi 9,31,4 /* Number of full quadwords remaining. */
-
- /* The proper alignment is present, it is OK to copy the bytes now. */
-L(copy_GE_32_unaligned_cont):
-
- /* Setup two indexes to speed up the indexed vector operations. */
- clrldi 11,31,60
- li 6,16 /* Index for 16-bytes offsets. */
- li 7,32 /* Index for 32-bytes offsets. */
- cmpldi cr1,11,0
- srdi 8,31,5 /* Setup the loop counter. */
- mr 10,3
- mr 11,12
- mtcrf 0x01,9
- cmpldi cr6,9,1
-#ifdef __LITTLE_ENDIAN__
- lvsr 5,0,12
-#else
- lvsl 5,0,12
-#endif
- lvx 3,0,12
- bf 31,L(setup_unaligned_loop)
-
-	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
- lvx 4,12,6
-#ifdef __LITTLE_ENDIAN__
- vperm 6,4,3,5
-#else
- vperm 6,3,4,5
-#endif
- addi 11,12,16
- addi 10,3,16
- stvx 6,0,3
- vor 3,4,4
-
-L(setup_unaligned_loop):
- mtctr 8
- ble cr6,L(end_unaligned_loop)
-
- /* Copy 32 bytes at a time using vector instructions. */
- .align 4
-L(unaligned_loop):
-
- /* Note: vr6/vr10 may contain data that was already copied,
- but in order to get proper alignment, we may have to copy
- some portions again. This is faster than having unaligned
- vector instructions though. */
-
- lvx 4,11,6 /* vr4 = r11+16. */
-#ifdef __LITTLE_ENDIAN__
- vperm 6,4,3,5
-#else
- vperm 6,3,4,5
-#endif
- lvx 3,11,7 /* vr3 = r11+32. */
-#ifdef __LITTLE_ENDIAN__
- vperm 10,3,4,5
-#else
- vperm 10,4,3,5
-#endif
- addi 11,11,32
- stvx 6,0,10
- stvx 10,10,6
- addi 10,10,32
-
- bdnz L(unaligned_loop)
-
- .align 4
-L(end_unaligned_loop):
-
- /* Check for tail bytes. */
- rldicr 0,31,0,59
- mtcrf 0x01,31
- beq cr1,0f
-
- add 3,3,0
- add 12,12,0
-
- /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
-8: /* Copy 8 bytes. */
- bf 28,4f
-
- lwz 6,0(12)
- lwz 7,4(12)
- addi 12,12,8
- stw 6,0(3)
- stw 7,4(3)
- addi 3,3,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- lwz 6,0(12)
- addi 12,12,4
- stw 6,0(3)
- addi 3,3,4
-2: /* Copy 2~3 bytes. */
- bf 30,1f
-
- lhz 6,0(12)
- addi 12,12,2
- sth 6,0(3)
- addi 3,3,2
-1: /* Copy 1 byte. */
- bf 31,0f
-
- lbz 6,0(12)
- stb 6,0(3)
-0: /* Return DST + LEN pointer. */
- ld 31,-8(1)
- ld 3,-16(1)
- add 3,3,5
- blr
-
-END_GEN_TB (MEMPCPY,TB_TOCLESS)
-libc_hidden_def (__mempcpy)
-weak_alias (__mempcpy, mempcpy)
-libc_hidden_builtin_def (mempcpy)
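
mempcpy differs from memcpy only in its return value, which is why the
code above saves the incoming r3 at -16(r1) and reloads it as r3 + r5
on every exit path.  The contract, as a trivial C sketch:

    #include <string.h>

    /* Same copy as memcpy; the return value points one past the last
       byte written.  */
    void *
    mempcpy_sketch (void *dest, const void *src, size_t len)
    {
      memcpy (dest, src, len);
      return (char *) dest + len;
    }
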
diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S
deleted file mode 100644
index 4276768915..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memrchr.S
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */
-
-#ifndef MEMRCHR
-# define MEMRCHR __memrchr
-#endif
- .machine power7
-ENTRY (MEMRCHR)
- CALL_MCOUNT 3
- add r7,r3,r5 /* Calculate the last acceptable address. */
- neg r0,r7
- addi r7,r7,-1
- mr r10,r3
- clrrdi r6,r7,7
- li r9,3<<5
- dcbt r9,r6,8 /* Stream hint, decreasing addresses. */
-
- /* Replicate BYTE to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
- li r6,-8
- li r9,-1
- rlwinm r0,r0,3,26,28 /* Calculate padding. */
- clrrdi r8,r7,3
- srd r9,r9,r0
- cmpldi r5,32
- clrrdi r0,r10,3
- ble L(small_range)
-
-#ifdef __LITTLE_ENDIAN__
- ldx r12,0,r8
-#else
- ldbrx r12,0,r8 /* Load reversed doubleword from memory. */
-#endif
- cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
- and r3,r3,r9
- cmpldi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
- bne cr7,L(done)
-
- mtcrf 0x01,r8
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
- bf 28,L(loop_setup)
-
- /* Handle DWORD2 of pair. */
-#ifdef __LITTLE_ENDIAN__
- ldx r12,r8,r6
-#else
- ldbrx r12,r8,r6
-#endif
- addi r8,r8,-8
- cmpb r3,r12,r4
- cmpldi cr7,r3,0
- bne cr7,L(done)
-
-L(loop_setup):
-   /* The last dword we want to read in the loop below is the one
-      containing the first byte of the string, i.e. the dword at
-      s & ~7, or r0.  The first dword read is at r8 - 8; we
-      read 2 * cnt dwords, so the last dword read will be at
-      r8 - 8 - 16 * cnt + 8.  Solving for cnt gives
-      cnt = (r8 - r0) / 16.  */
- sub r5,r8,r0
- addi r8,r8,-8
- srdi r9,r5,4 /* Number of loop iterations. */
- mtctr r9 /* Setup the counter. */
-
- /* Main loop to look for BYTE backwards in the string.
- FIXME: Investigate whether 32 byte align helps with this
- 9 instruction loop. */
- .align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the byte-checking process for bigger strings. */
-
-#ifdef __LITTLE_ENDIAN__
- ldx r12,0,r8
- ldx r11,r8,r6
-#else
- ldbrx r12,0,r8
- ldbrx r11,r8,r6
-#endif
- cmpb r3,r12,r4
- cmpb r9,r11,r4
- or r5,r9,r3 /* Merge everything in one doubleword. */
- cmpldi cr7,r5,0
- bne cr7,L(found)
- addi r8,r8,-16
- bdnz L(loop)
-
- /* We may have one more word to read. */
- cmpld r8,r0
- bnelr
-
-#ifdef __LITTLE_ENDIAN__
- ldx r12,0,r8
-#else
- ldbrx r12,0,r8
-#endif
- cmpb r3,r12,r4
- cmpldi cr7,r3,0
- bne cr7,L(done)
- blr
-
- .align 4
-L(found):
- /* OK, one (or both) of the dwords contains BYTE. Check
- the first dword. */
- cmpldi cr6,r3,0
- bne cr6,L(done)
-
- /* BYTE must be in the second word. Adjust the address
- again and move the result of cmpb to r3 so we can calculate the
- pointer. */
-
- mr r3,r9
- addi r8,r8,-8
-
- /* r3 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as BYTE in the original
- word from the string. Use that to calculate the pointer.
- We need to make sure BYTE is *before* the end of the
- range. */
-L(done):
- cntlzd r9,r3 /* Count leading zeros before the match. */
- cmpld r8,r0 /* Are we on the last word? */
- srdi r6,r9,3 /* Convert leading zeros to bytes. */
- addi r0,r6,-7
- sub r3,r8,r0
- cmpld cr7,r3,r10
- bnelr
- bgelr cr7
- li r3,0
- blr
-
- .align 4
-L(null):
- li r3,0
- blr
-
-/* Deals with size <= 32. */
- .align 4
-L(small_range):
- cmpldi r5,0
- beq L(null)
-
-#ifdef __LITTLE_ENDIAN__
- ldx r12,0,r8
-#else
- ldbrx r12,0,r8 /* Load reversed doubleword from memory. */
-#endif
- cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
- and r3,r3,r9
- cmpldi cr7,r3,0
- bne cr7,L(done)
-
- /* Are we done already? */
- cmpld r8,r0
- addi r8,r8,-8
- beqlr
-
- .align 5
-L(loop_small):
-#ifdef __LITTLE_ENDIAN__
- ldx r12,0,r8
-#else
- ldbrx r12,0,r8
-#endif
- cmpb r3,r12,r4
- cmpld r8,r0
- cmpldi cr7,r3,0
- bne cr7,L(done)
- addi r8,r8,-8
- bne L(loop_small)
- blr
-
-END (MEMRCHR)
-weak_alias (__memrchr, memrchr)
-libc_hidden_builtin_def (memrchr)
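
The loop above leans on cmpb, which deposits 0xff in every byte lane
where its two operands agree.  Where no such instruction exists, the
usual portable substitute is the SWAR borrow trick sketched below (a
sketch, not glibc code; it marks matching lanes with 0x80, and lanes
above the lowest genuine match can carry borrow artifacts, which is
harmless when only the first match is consumed):

    #include <stdint.h>

    /* Per-lane match mask for one 8-byte word: XOR zeroes the lanes
       equal to the target byte, and subtracting 0x01...01 borrows out
       of exactly those zero lanes.  */
    static uint64_t
    byte_match_mask (uint64_t word, unsigned char c)
    {
      uint64_t pat = c * 0x0101010101010101ULL;  /* replicate the byte */
      uint64_t x = word ^ pat;                   /* zero lane == match */
      return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
    }

memrchr wants the last match rather than the first, so the code above
normalizes both endiannesses to the same lane order (ldbrx on
big-endian) and lets a single cntlzd find the highest-addressed hit.
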
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
deleted file mode 100644
index 21933c0672..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ /dev/null
@@ -1,399 +0,0 @@
-/* Optimized memset implementation for PowerPC64/POWER7.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
- Returns 's'. */
-
-#ifndef MEMSET
-# define MEMSET memset
-#endif
- .machine power7
-EALIGN (MEMSET, 5, 0)
- CALL_MCOUNT 3
-
-L(_memset):
- cmpldi cr7,5,31
- cmpldi cr6,5,8
- mr 10,3
-
- /* Replicate byte to word. */
- insrdi 4,4,8,48
- insrdi 4,4,16,32
- ble cr6,L(small) /* If length <= 8, use short copy code. */
-
- neg 0,3
- ble cr7,L(medium) /* If length < 32, use medium copy code. */
-
-	andi.	11,10,7		/* Check alignment of DST.  */
- insrdi 4,4,32,0 /* Replicate word to double word. */
-
- mr 12,5
- beq L(big_aligned)
-
- clrldi 0,0,61
- mtocrf 0x01,0
- subf 5,0,5
-
- /* Get DST aligned to 8 bytes. */
-1: bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: bf 30,4f
-
- sth 4,0(10)
- addi 10,10,2
-4: bf 29,L(big_aligned)
-
- stw 4,0(10)
- addi 10,10,4
-
- .align 4
-L(big_aligned):
-
- cmpldi cr5,5,255
- li 0,32
- dcbtst 0,10
- cmpldi cr6,4,0
- srdi 9,5,3 /* Number of full doublewords remaining. */
- crand 27,26,21
- mtocrf 0x01,9
- bt 27,L(huge)
-
-	/* From this point on, we'll store 32+ bytes and either the value
-	   isn't 0 or the length is at most 255 (so we don't use dcbz).  */
-
- srdi 8,5,5
- clrldi 11,5,61
- cmpldi cr6,11,0
- cmpldi cr1,9,4
- mtctr 8
-
- /* Copy 1~3 doublewords so the main loop starts
- at a multiple of 32 bytes. */
-
- bf 30,1f
-
- std 4,0(10)
- std 4,8(10)
- addi 10,10,16
- bf 31,L(big_loop)
-
- std 4,0(10)
- addi 10,10,8
- mr 12,10
- blt cr1,L(tail_bytes)
- b L(big_loop)
-
- .align 4
-1: /* Copy 1 doubleword. */
- bf 31,L(big_loop)
-
- std 4,0(10)
- addi 10,10,8
-
- /* Main aligned copy loop. Copies 32-bytes at a time and
- ping-pong through r10 and r12 to avoid AGEN delays. */
- .align 4
-L(big_loop):
- addi 12,10,32
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- bdz L(tail_bytes)
-
- addi 10,10,64
- std 4,0(12)
- std 4,8(12)
- std 4,16(12)
- std 4,24(12)
- bdnz L(big_loop)
-
- mr 12,10
- b L(tail_bytes)
-
- .align 4
-L(tail_bytes):
-
- /* Check for tail bytes. */
- beqlr cr6
-
- clrldi 0,5,61
- mtocrf 0x01,0
-
- /* At this point we have a tail of 0-7 bytes and we know that the
- destination is doubleword-aligned. */
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(12)
- addi 12,12,4
-2: /* Copy 2 bytes. */
- bf 30,1f
-
- sth 4,0(12)
- addi 12,12,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(12)
- blr
-
-	/* Special case when value is 0 and we have a long length to deal
-	   with.  Use dcbz to zero out 128 bytes at a time.  Before using
-	   dcbz though, we need to get the destination 128-byte aligned.  */
- .align 4
-L(huge):
- andi. 11,10,127
- neg 0,10
- beq L(huge_aligned)
-
- clrldi 0,0,57
- subf 5,0,5
- srdi 0,0,3
- mtocrf 0x01,0
-
- /* Get DST aligned to 128 bytes. */
-8: bf 28,4f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- std 4,32(10)
- std 4,40(10)
- std 4,48(10)
- std 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- std 4,0(10)
- std 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(huge_aligned)
-
- std 4,0(10)
- addi 10,10,8
-
-
-L(huge_aligned):
- srdi 8,5,7
- clrldi 11,5,57
- cmpldi cr6,11,0
- mtctr 8
-
- .align 4
-L(huge_loop):
- dcbz 0,10
- addi 10,10,128
- bdnz L(huge_loop)
-
- /* Check how many bytes are still left. */
- beqlr cr6
-
- subf 9,3,10
- subf 5,9,12
- srdi 8,5,3
- cmpldi cr6,8,0
- mtocrf 0x01,8
-
-	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
-	   speed.  We'll handle the resulting tail bytes later.  */
- beq cr6,L(tail)
-
-8: bf 28,4f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- std 4,32(10)
- std 4,40(10)
- std 4,48(10)
- std 4,56(10)
- addi 10,10,64
- .align 4
-4: bf 29,2f
-
- std 4,0(10)
- std 4,8(10)
- std 4,16(10)
- std 4,24(10)
- addi 10,10,32
- .align 4
-2: bf 30,1f
-
- std 4,0(10)
- std 4,8(10)
- addi 10,10,16
- .align 4
-1: bf 31,L(tail)
-
- std 4,0(10)
- addi 10,10,8
-
- /* Handle the rest of the tail bytes here. */
-L(tail):
- mtocrf 0x01,5
-
- .align 4
-4: bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
- .align 4
-2: bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
- .align 4
-1: bflr 31
-
- stb 4,0(10)
- blr
-
- /* Expanded tree to copy tail bytes without increments. */
- .align 4
-L(copy_tail):
- bf 29,L(FXX)
-
- stw 4,0(10)
- bf 30,L(TFX)
-
- sth 4,4(10)
- bflr 31
-
- stb 4,6(10)
- blr
-
- .align 4
-L(FXX): bf 30,L(FFX)
-
- sth 4,0(10)
- bflr 31
-
- stb 4,2(10)
- blr
-
- .align 4
-L(TFX): bflr 31
-
- stb 4,4(10)
- blr
-
- .align 4
-L(FFX): bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handle copies of 9~31 bytes. */
- .align 4
-L(medium):
- /* At least 9 bytes to go. */
- andi. 11,10,3
- clrldi 0,0,62
- beq L(medium_aligned)
-
- /* Force 4-bytes alignment for DST. */
- mtocrf 0x01,0
- subf 5,0,5
-1: /* Copy 1 byte. */
- bf 31,2f
-
- stb 4,0(10)
- addi 10,10,1
-2: /* Copy 2 bytes. */
- bf 30,L(medium_aligned)
-
- sth 4,0(10)
- addi 10,10,2
-
- .align 4
-L(medium_aligned):
- /* At least 6 bytes to go, and DST is word-aligned. */
- cmpldi cr1,5,16
- mtocrf 0x01,5
- blt cr1,8f
-
- /* Copy 16 bytes. */
- stw 4,0(10)
- stw 4,4(10)
- stw 4,8(10)
- stw 4,12(10)
- addi 10,10,16
-8: /* Copy 8 bytes. */
- bf 28,4f
-
- stw 4,0(10)
- stw 4,4(10)
- addi 10,10,8
-4: /* Copy 4 bytes. */
- bf 29,2f
-
- stw 4,0(10)
- addi 10,10,4
-2: /* Copy 2-3 bytes. */
- bf 30,1f
-
- sth 4,0(10)
- addi 10,10,2
-1: /* Copy 1 byte. */
- bflr 31
-
- stb 4,0(10)
- blr
-
- /* Handles copies of 0~8 bytes. */
- .align 4
-L(small):
- mtocrf 0x01,5
- bne cr6,L(copy_tail)
-
- stw 4,0(10)
- stw 4,4(10)
- blr
-
-END_GEN_TB (MEMSET,TB_TOCLESS)
-libc_hidden_builtin_def (memset)
-
-/* Copied from bzero.S to prevent the linker from inserting a stub
- between bzero and memset. */
-ENTRY (__bzero)
- CALL_MCOUNT 3
- mr r5,r4
- li r4,0
- b L(_memset)
-END (__bzero)
-#ifndef __bzero
-weak_alias (__bzero, bzero)
-#endif
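
MEMSET picks among three strategies: a short path for n <= 8, a medium
path for n <= 31, and, when the fill byte is zero and n > 255 (the
crand over the cr5/cr6 results), the L(huge) path that zeroes whole
128-byte cache blocks with dcbz.  A plain C sketch of the general
shape, minus the dcbz special case (illustrative only; the single word
store stands in for the unrolled 32-byte groups, and the cast is the
usual aliasing shortcut):

    #include <stddef.h>
    #include <stdint.h>

    void *
    memset_sketch (void *s, int c, size_t n)
    {
      uint64_t word = (unsigned char) c * 0x0101010101010101ULL;
      unsigned char *p = s;
      /* Head: byte stores until p is 8-byte aligned.  */
      while (n > 0 && ((uintptr_t) p & 7) != 0)
        { *p++ = (unsigned char) c; n--; }
      /* Body: doubleword stores; the assembly unrolls this into
         32-byte groups and, for c == 0 and large n, replaces it with
         the dcbz cache-block loop.  */
      for (; n >= 8; n -= 8, p += 8)
        *(uint64_t *) p = word;
      /* Tail: remaining 0-7 bytes.  */
      while (n-- > 0)
        *p++ = (unsigned char) c;
      return s;
    }
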
diff --git a/sysdeps/powerpc/powerpc64/power7/multiarch/Implies b/sysdeps/powerpc/powerpc64/power7/multiarch/Implies
deleted file mode 100644
index bf5d6171a5..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/multiarch/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power6/multiarch
diff --git a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
deleted file mode 100644
index 48afb75943..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* void * [r3] rawmemchr (const void *s [r3], int c [r4])  */
-
-#ifndef RAWMEMCHR
-# define RAWMEMCHR __rawmemchr
-#endif
- .machine power7
-ENTRY (RAWMEMCHR)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
-
- /* Replicate byte to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
-
- /* Now r4 has a doubleword of c bytes. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- ld r12,0(r8) /* Load doubleword from memory. */
- cmpb r5,r12,r4 /* Compare each byte against c byte. */
-#ifdef __LITTLE_ENDIAN__
- srd r5,r5,r6
- sld r5,r5,r6
-#else
- sld r5,r5,r6 /* Move left to discard ignored bits. */
- srd r5,r5,r6 /* Bring the bits back as zeros. */
-#endif
- cmpdi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */
- bne cr7,L(done)
-
- mtcrf 0x01,r8
-
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop)
-
- /* Handle DWORD2 of pair. */
- ldu r12,8(r8)
- cmpb r5,r12,r4
- cmpdi cr7,r5,0
- bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- /* Main loop to look for the end of the string. Since it's a
- small loop (< 8 instructions), align it to 32-bytes. */
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the byte-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r5,r12,r4
- cmpb r6,r11,r4
- or r7,r5,r6
- cmpdi cr7,r7,0
- beq cr7,L(loop)
-
- /* OK, one (or both) of the doublewords contains a 'c' byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a c byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The 'c' byte must be in the second doubleword. Adjust the address
-      again and move the result of cmpb to r5 so we can calculate the
- pointer. */
- mr r5,r6
- addi r8,r8,8
-
- /* r5 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the 'c' byte in the original
- doubleword from the string. Use that fact to find out what is
- the position of the byte inside the string. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r5,-1
- andc r0,r0,r5
- popcntd r0,r0 /* Count trailing zeros. */
-#else
- cntlzd r0,r5 /* Count leading zeros before the match. */
-#endif
- srdi r0,r0,3 /* Convert leading zeros to bytes. */
- add r3,r8,r0 /* Return address of the matching char. */
- blr
-END (RAWMEMCHR)
-weak_alias (__rawmemchr,rawmemchr)
-libc_hidden_builtin_def (__rawmemchr)
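
The L(done) epilogue above turns the cmpb mask into a byte index.  On
little-endian the first matching byte is the lowest-order 0xff lane, so
the addi/andc/popcntd sequence computes a trailing-zero count
((r5 - 1) & ~r5 keeps exactly the bits below the lowest set bit); on
big-endian the first match is the highest-order lane, so cntlzd is used
instead.  A C rendering of the same choice (sketch; the __builtin_*zll
intrinsics are the GCC/Clang ones, and the mask must be nonzero):

    #include <stdint.h>

    static unsigned int
    first_match_offset (uint64_t mask, int little_endian)
    {
      if (little_endian)
        return (unsigned int) __builtin_ctzll (mask) / 8;
      return (unsigned int) __builtin_clzll (mask) / 8;
    }
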
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
deleted file mode 100644
index a346dd7e28..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/stpncpy.S
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Optimized stpncpy implementation for PowerPC64/POWER7.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_STPNCPY
-#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
-
-weak_alias (__stpncpy, stpncpy)
-libc_hidden_def (__stpncpy)
-libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
deleted file mode 100644
index e856b8a593..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Optimized strcasecmp implementation for PowerPC64.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <locale-defines.h>
-
-/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
-
- or if defined USE_IN_EXTENDED_LOCALE_MODEL:
-
- int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
- __locale_t loc [r5]) */
-
-#ifndef STRCMP
-# define __STRCMP __strcasecmp
-# define STRCMP strcasecmp
-#endif
-
-ENTRY (__STRCMP)
-#ifndef USE_IN_EXTENDED_LOCALE_MODEL
- CALL_MCOUNT 2
-#else
- CALL_MCOUNT 3
-#endif
-
-#define rRTN r3 /* Return value */
-#define rSTR1 r5 /* 1st string */
-#define rSTR2 r4 /* 2nd string */
-#define rLOCARG r5 /* 3rd argument: locale_t */
-#define rCHAR1 r6 /* Byte read from 1st string */
-#define rCHAR2 r7 /* Byte read from 2nd string */
-#define rADDR1 r8 /* Address of tolower(rCHAR1) */
-#define rADDR2 r12 /* Address of tolower(rCHAR2) */
-#define rLWR1 r8 /* Word tolower(rCHAR1) */
-#define rLWR2 r12 /* Word tolower(rCHAR2) */
-#define rTMP r9
-#define rLOC r11 /* Default locale address */
-
- cmpd cr7, r3, r4
-#ifndef USE_IN_EXTENDED_LOCALE_MODEL
- ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
- add rLOC, rTMP, __libc_tsd_LOCALE@tls
- ld rLOC, 0(rLOC)
-#else
- mr rLOC, rLOCARG
-#endif
- ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
- mr rSTR1, rRTN
- li rRTN, 0
- beqlr cr7
-
-
-	/* Unrolled loop for POWER: loads are done with 'lbz' plus an
-	   offset, and the string pointers are only updated at the end
-	   of each unrolled iteration.  */
-
- lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
- lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
-L(loop):
- cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
- sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
- sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
- lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
- lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
- cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
-	crorc	4*cr1+eq,eq,4*cr1+eq /* (*s1 == '\0') || (r == 0) */
- beq cr1, L(done)
- lbz rCHAR1, 1(rSTR1)
- lbz rCHAR2, 1(rSTR2)
- cmpdi rCHAR1, 0
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1, L(done)
- lbz rCHAR1, 2(rSTR1)
- lbz rCHAR2, 2(rSTR2)
- cmpdi rCHAR1, 0
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1, L(done)
- lbz rCHAR1, 3(rSTR1)
- lbz rCHAR2, 3(rSTR2)
- cmpdi rCHAR1, 0
- /* Increment both string descriptors */
- addi rSTR1, rSTR1, 4
- addi rSTR2, rSTR2, 4
- sldi rADDR1, rCHAR1, 2
- sldi rADDR2, rCHAR2, 2
- lwzx rLWR1, rLOC, rADDR1
- lwzx rLWR2, rLOC, rADDR2
- cmpw cr1, rLWR1, rLWR2
- crorc 4*cr1+eq,eq,4*cr1+eq
- beq cr1,L(done)
- lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
- lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
- b L(loop)
-L(done):
- subf r0, rLWR2, rLWR1
- extsw rRTN, r0
- blr
-END (__STRCMP)
-
-weak_alias (__STRCMP, STRCMP)
-libc_hidden_builtin_def (__STRCMP)
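
Stripped of its four-way unrolling, the loop above is a
compare-through-tolower scan that stops at the first difference or at
the end of s1.  A C sketch with the same exit conditions (plain
tolower stands in for the LOCALE_CTYPE_TOLOWER table the assembly
indexes directly, one 32-bit entry per character, hence the sldi by 2):

    #include <ctype.h>

    static int
    strcasecmp_sketch (const char *s1, const char *s2)
    {
      if (s1 == s2)                /* the cmpd cr7,r3,r4 early exit */
        return 0;
      int c1, c2;
      do
        {
          c1 = tolower ((unsigned char) *s1++);
          c2 = tolower ((unsigned char) *s2++);
        }
      while (c1 != 0 && c1 == c2);
      return c1 - c2;
    }
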
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
deleted file mode 100644
index c13c4ebcb8..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#define USE_IN_EXTENDED_LOCALE_MODEL
-#define STRCMP strcasecmp_l
-#define __STRCMP __strcasecmp_l
-
-#include "strcasecmp.S"
diff --git a/sysdeps/powerpc/powerpc64/power7/strchr.S b/sysdeps/powerpc/powerpc64/power7/strchr.S
deleted file mode 100644
index a18e2e101c..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strchr.S
+++ /dev/null
@@ -1,230 +0,0 @@
-/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRCHR
-# define STRCHR strchr
-#endif
-
-/* char * [r3] strchr (const char *s [r3], int c [r4])  */
- .machine power7
-ENTRY (STRCHR)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
- cmpdi cr7,r4,0
- ld r12,0(r8) /* Load doubleword from memory. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
-
- beq cr7,L(null_match)
-
- /* Replicate byte to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
-
- /* Now r4 has a doubleword of c bytes and r0 has
- a doubleword of null bytes. */
-
- cmpb r10,r12,r4 /* Compare each byte against c byte. */
- cmpb r11,r12,r0 /* Compare each byte against null byte. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- srd r11,r11,r6
- sld r10,r10,r6
- sld r11,r11,r6
-#else
- sld r10,r10,r6
- sld r11,r11,r6
- srd r10,r10,r6
- srd r11,r11,r6
-#endif
- or r5,r10,r11 /* OR the results to speed things up. */
- cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
- have been found. */
- bne cr7,L(done)
-
- mtcrf 0x01,r8
-
-	/* Are we now aligned to a quadword boundary?  If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop)
-
-	/* Handle DWORD2 of pair.  */
- ldu r12,8(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r9,16(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- cmpb r6,r9,r4
- cmpb r7,r9,r0
- or r12,r10,r11
- or r9,r6,r7
- or r5,r12,r9
- cmpdi cr7,r5,0
- beq cr7,L(loop)
-
- /* OK, one (or both) of the doublewords contains a c/null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a c/null byte. */
-
- cmpdi cr6,r12,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The c/null byte must be in the second doubleword. Adjust the
-      address again and move the results of cmpb to r10/r11 so we can
-      calculate the pointer.  */
-
- mr r10,r6
- mr r11,r7
- addi r8,r8,8
-
- /* r10/r11 have the output of the cmpb instructions, that is,
- 0xff in the same position as the c/null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r3,r10,-1
- andc r3,r3,r10
- popcntd r0,r3
- addi r4,r11,-1
- andc r4,r4,r11
- cmpld cr7,r3,r4
- bgt cr7,L(no_match)
-#else
- cntlzd r0,r10 /* Count leading zeros before c matches. */
- cmpld cr7,r11,r10
- bgt cr7,L(no_match)
-#endif
- srdi r0,r0,3 /* Convert leading zeros to bytes. */
- add r3,r8,r0 /* Return address of the matching c byte
- or null in case c was not found. */
- blr
-
- .align 4
-L(no_match):
- li r3,0
- blr
-
-/* We are here because strchr was called with a null byte. */
- .align 4
-L(null_match):
- /* r0 has a doubleword of null bytes. */
-
- cmpb r5,r12,r0 /* Compare each byte against null bytes. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r5,r5,r6
- sld r5,r5,r6
-#else
- sld r5,r5,r6
- srd r5,r5,r6
-#endif
-	cmpdi	cr7,r5,0	/* If r5 == 0, no null bytes
-				   have been found.  */
- bne cr7,L(done_null)
-
- mtcrf 0x01,r8
-
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop_null)
-
-	/* Handle DWORD2 of pair.  */
- ldu r12,8(r8)
- cmpb r5,r12,r0
- cmpdi cr7,r5,0
- bne cr7,L(done_null)
- b L(loop_null) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- /* Main loop to look for the end of the string. Since it's a
- small loop (< 8 instructions), align it to 32-bytes. */
- .p2align 5
-L(loop_null):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r5,r12,r0
- cmpb r10,r11,r0
- or r6,r5,r10
- cmpdi cr7,r6,0
- beq cr7,L(loop_null)
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done_null)
-
- /* The null byte must be in the second doubleword. Adjust the address
-      again and move the result of cmpb to r5 so we can calculate the
- pointer. */
-
- mr r5,r10
- addi r8,r8,8
-
- /* r5 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done_null):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r5,-1
- andc r0,r0,r5
- popcntd r0,r0
-#else
- cntlzd r0,r5 /* Count leading zeros before the match. */
-#endif
- srdi r0,r0,3 /* Convert leading zeros to bytes. */
- add r3,r8,r0 /* Return address of the matching null byte. */
- blr
-END (STRCHR)
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
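
The loop above tracks two cmpb masks per doubleword, one for c and one
for NUL, and ORs them so a single branch ends the scan for either
reason; the epilogue then decides which mask hit first.  Byte-wise, the
contract is (sketch; note that c == 0 degenerates into finding the
terminator, which the assembly handles in its separate L(null_match)
path):

    static char *
    strchr_sketch (const char *s, int c)
    {
      unsigned char target = (unsigned char) c;
      for (;; s++)
        {
          if (*(const unsigned char *) s == target)
            return (char *) s;     /* found c (also handles c == 0) */
          if (*s == '\0')
            return 0;              /* hit NUL first: no match */
        }
    }
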
diff --git a/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
deleted file mode 100644
index 27bc1f0682..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strchrnul.S
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRCHRNUL
-# define STRCHRNUL __strchrnul
-#endif
-/* char * [r3] strchrnul (const char *s [r3], int c [r4])  */
- .machine power7
-ENTRY (STRCHRNUL)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
-
- /* Replicate byte to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- ld r12,0(r8) /* Load doubleword from memory. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
-
- /* Now r4 has a doubleword of c bytes and r0 has
- a doubleword of null bytes. */
-
-	cmpb	r10,r12,r0	/* Compare each byte against null byte.  */
-	cmpb	r9,r12,r4	/* Compare each byte against c byte.  */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and to bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- srd r9,r9,r6
- sld r10,r10,r6
- sld r9,r9,r6
-#else
- sld r10,r10,r6
- sld r9,r9,r6
- srd r10,r10,r6
- srd r9,r9,r6
-#endif
- or r5,r9,r10 /* OR the results to speed things up. */
- cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
- have been found. */
- bne cr7,L(done)
-
- mtcrf 0x01,r8
-
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop)
-
- /* Handle DWORD2 of pair. */
- ldu r12,8(r8)
- cmpb r10,r12,r0
- cmpb r9,r12,r4
- or r5,r9,r10
- cmpdi cr7,r5,0
- bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r10,r12,r0
- cmpb r9,r12,r4
- cmpb r6,r11,r0
- cmpb r7,r11,r4
- or r5,r9,r10
- or r10,r6,r7
- or r11,r5,r10
- cmpdi cr7,r11,0
- beq cr7,L(loop)
-
- /* OK, one (or both) of the doublewords contains a c/null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a c/null byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The c/null byte must be in the second doubleword. Adjust the
- address again and move the result of cmpb to r5 so we can calculate
- the pointer. */
- mr r5,r10
- addi r8,r8,8
-
- /* r5 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the c/null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r5,-1
- andc r0,r0,r5
- popcntd r0,r0
-#else
- cntlzd r0,r5 /* Count leading zeros before the match. */
-#endif
- srdi r0,r0,3 /* Convert leading zeros to bytes. */
- add r3,r8,r0 /* Return address of matching c/null byte. */
- blr
-END (STRCHRNUL)
-weak_alias (STRCHRNUL, strchrnul)
-libc_hidden_builtin_def (STRCHRNUL)
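
The strchrnul above replicates c into a doubleword with insrdi, then
compares each loaded doubleword against both the c pattern and zero and
ORs the two cmpb masks, so a single branch covers both terminators.  A
rough portable C sketch of the same idea, assuming the 8-byte loads are
in bounds (the assembly guarantees that by aligning first); HASZERO and
my_strchrnul are illustrative names:

#include <stdint.h>

#define ONES  0x0101010101010101ULL
#define HIGHS 0x8080808080808080ULL
/* Nonzero exactly when x contains a zero byte.  */
#define HASZERO(x) (((x) - ONES) & ~(x) & HIGHS)

static char *
my_strchrnul (const char *s, int c)
{
  uint64_t rep = (uint8_t) c * ONES;	/* replicate c, like insrdi */
  /* Assume s is 8-byte aligned here; the assembly handles the head.  */
  const uint64_t *w = (const uint64_t *) s;

  /* Stop at the first word holding a NUL or a c byte; the OR mirrors
     "or r5,r9,r10" in the main loop.  */
  while (!HASZERO (*w) && !HASZERO (*w ^ rep))
    w++;
  s = (const char *) w;
  while (*s != '\0' && *(const uint8_t *) s != (uint8_t) c)
    s++;
  return (char *) s;
}
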
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
deleted file mode 100644
index 14e14f457e..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strcmp.S
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Optimized strcmp implementation for Power7 using 'cmpb' instruction
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* The optimization here comes from the cmpb instruction.  8-byte
-   aligned strings are processed with doubleword comparisons, while
-   unaligned strings are handled effectively with a loop-unrolled
-   byte path.  */
-
-#include <sysdep.h>
-
-#ifndef STRCMP
-# define STRCMP strcmp
-#endif
-
-/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
-
- .machine power7
-EALIGN (STRCMP, 4, 0)
- CALL_MCOUNT 2
-
- or r9, r3, r4
-	rldicl. r10, r9, 0, 61	/* Are s1 and s2 8-byte aligned?  */
- bne cr0, L(process_unaligned_bytes)
- li r5, 0
-
- .align 4
-/* Process input on a doubleword-aligned boundary.  */
-L(unrollDword):
- ld r8,0(r3)
- ld r10,0(r4)
- cmpb r7,r8,r5
- cmpdi cr7,r7,0
- mr r9,r7
- bne cr7,L(null_found)
- cmpld cr7,r8,r10
- bne cr7,L(different)
-
- ld r8,8(r3)
- ld r10,8(r4)
- cmpb r7,r8,r5
- cmpdi cr7,r7,0
- mr r9,r7
- bne cr7,L(null_found)
- cmpld cr7,r8,r10
- bne cr7,L(different)
-
- ld r8,16(r3)
- ld r10,16(r4)
- cmpb r7,r8,r5
- cmpdi cr7,r7,0
- mr r9,r7
- bne cr7,L(null_found)
- cmpld cr7,r8,r10
- bne cr7,L(different)
-
- ld r8,24(r3)
- ld r10,24(r4)
- cmpb r7,r8,r5
- cmpdi cr7,r7,0
- mr r9,r7
- bne cr7,L(null_found)
- cmpld cr7,r8,r10
- bne cr7,L(different)
-
- addi r3, r3, 32
- addi r4, r4, 32
- beq cr7, L(unrollDword)
-
- .align 4
-L(null_found):
-#ifdef __LITTLE_ENDIAN__
- neg r7,r9
- and r9,r9,r7
- li r7,-1
- cntlzd r9,r9
- subfic r9,r9,71
- sld r9,r7,r9
-#else
- cntlzd r9,r9
- li r7,-1
- addi r9,r9,8
- srd r9,r7,r9
-#endif
- or r8,r8,r9
- or r10,r10,r9
-
-L(different):
- cmpb r9,r8,r10
-#ifdef __LITTLE_ENDIAN__
- addi r7,r9,1
- andc r9,r7,r9
- cntlzd r9,r9
- subfic r9,r9,63
-#else
- not r9,r9
- cntlzd r9,r9
- subfic r9,r9,56
-#endif
- srd r3,r8,r9
- srd r10,r10,r9
- rldicl r10,r10,0,56
- rldicl r3,r3,0,56
- subf r3,r10,r3
- blr
-
- .align 4
-L(process_unaligned_bytes):
- lbz r9, 0(r3) /* load byte from s1 */
- lbz r10, 0(r4) /* load byte from s2 */
- cmpdi cr7, r9, 0 /* compare *s1 with NULL */
-	beq cr7, L(diffOfNULL)	/* if *s1 is null, return *s1 - *s2  */
- cmplw cr7, r9, r10 /* compare *s1 and *s2 */
- bne cr7, L(ComputeDiff) /* branch to compute difference and return */
-
- lbz r9, 1(r3) /* load next byte from s1 */
- lbz r10, 1(r4) /* load next byte from s2 */
- cmpdi cr7, r9, 0 /* compare *s1 with NULL */
-	beq cr7, L(diffOfNULL)	/* if *s1 is null, return *s1 - *s2  */
- cmplw cr7, r9, r10 /* compare *s1 and *s2 */
- bne cr7, L(ComputeDiff) /* branch to compute difference and return */
-
- lbz r9, 2(r3) /* unroll 3rd byte here */
- lbz r10, 2(r4)
- cmpdi cr7, r9, 0
- beq cr7, L(diffOfNULL)
- cmplw cr7, r9, r10
-	bne cr7, L(ComputeDiff)
-
- lbz r9, 3(r3) /* unroll 4th byte now */
- lbz r10, 3(r4)
- addi r3, r3, 4 /* increment s1 by unroll factor */
- cmpdi cr7, r9, 0
-	cmplw cr6, r9, r10
- beq cr7, L(diffOfNULL)
- addi r4, r4, 4 /* increment s2 by unroll factor */
- beq cr6, L(process_unaligned_bytes) /* unroll byte processing */
-
- .align 4
-L(ComputeDiff):
- extsw r9, r9
- subf r10, r10, r9 /* compute s1 - s2 */
- extsw r3, r10
- blr /* return */
-
- .align 4
-L(diffOfNULL):
- li r9, 0
- subf r10, r10, r9 /* compute s1 - s2 */
- extsw r3, r10 /* sign extend result */
- blr /* return */
-
-END (STRCMP)
-libc_hidden_builtin_def (strcmp)
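
In L(null_found) above, bytes past the terminating NUL must not make two
equal strings compare unequal, so the code builds a mask of ones covering
everything after the first NUL and ORs it into both words.  A small C
model of the big-endian (cntlzd) variant; mask_past_nul_be is a
hypothetical name:

#include <stdint.h>

/* nulmask is the cmpb result for s1's word: 0xff in the lane of the
   first NUL (nulmask != 0).  Force all later bytes to 0xff in both
   words so they compare equal.  */
static void
mask_past_nul_be (uint64_t *w1, uint64_t *w2, uint64_t nulmask)
{
  unsigned int sh = (unsigned int) __builtin_clzll (nulmask) + 8;
  /* The hardware srd yields 0 for a shift count of 64; C needs a guard.  */
  uint64_t ones_after = sh >= 64 ? 0 : ~0ULL >> sh;
  *w1 |= ones_after;	/* or r8,r8,r9 */
  *w2 |= ones_after;	/* or r10,r10,r9 */
}
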
diff --git a/sysdeps/powerpc/powerpc64/power7/strlen.S b/sysdeps/powerpc/powerpc64/power7/strlen.S
deleted file mode 100644
index 63848c460c..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strlen.S
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* size_t [r3] strlen (const char *s [r3])  */
-
-#ifndef STRLEN
-# define STRLEN strlen
-#endif
- .machine power7
-ENTRY (STRLEN)
- CALL_MCOUNT 1
- dcbt 0,r3
- clrrdi r4,r3,3 /* Align the address to doubleword boundary. */
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
- li r5,-1 /* MASK = 0xffffffffffffffff. */
- ld r12,0(r4) /* Load doubleword from memory. */
-#ifdef __LITTLE_ENDIAN__
- sld r5,r5,r6
-#else
- srd r5,r5,r6 /* MASK = MASK >> padding. */
-#endif
- orc r9,r12,r5 /* Mask bits that are not part of the string. */
- cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
-	cmpdi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
- bne cr7,L(done)
-
- mtcrf 0x01,r4
-
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop)
-
- /* Handle DWORD2 of pair. */
- ldu r12,8(r4)
- cmpb r10,r12,r0
- cmpdi cr7,r10,0
- bne cr7,L(done)
-
- /* Main loop to look for the end of the string. Since it's a
-	   small loop (< 8 instructions), align it to 32 bytes.  */
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
-
- ld r12, 8(r4)
- ldu r11, 16(r4)
- cmpb r10,r12,r0
- cmpb r9,r11,r0
- or r8,r9,r10 /* Merge everything in one doubleword. */
- cmpdi cr7,r8,0
- beq cr7,L(loop)
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r10,0
- addi r4,r4,-8
- bne cr6,L(done)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- length. */
-
- mr r10,r9
- addi r4,r4,8
-
- /* r10 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the length. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r9, r10, -1 /* Form a mask from trailing zeros. */
- andc r9, r9, r10
- popcntd r0, r9 /* Count the bits in the mask. */
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- subf r5,r3,r4
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r5,r0 /* Compute final length. */
- blr
-END (STRLEN)
-libc_hidden_builtin_def (strlen)
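
The strlen prologue above computes the padding as (s & 7) * 8 (the
rlwinm) and uses orc to force the bytes before the start of the string
to be nonzero, so they can never look like a NUL.  The same setup in C,
big-endian variant to match the srd path (first_word_be is a
hypothetical name):

#include <stdint.h>

static uint64_t
first_word_be (const char *s)
{
  uintptr_t a = (uintptr_t) s;
  const uint64_t *w = (const uint64_t *) (a & ~(uintptr_t) 7); /* clrrdi */
  unsigned int pad = (unsigned int) (a & 7) * 8; /* rlwinm r6,r3,3,26,28 */
  uint64_t mask = ~0ULL >> pad;			 /* srd r5,r5,r6 */
  return *w | ~mask;				 /* orc r9,r12,r5 */
}
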
diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S
deleted file mode 100644
index d53b31be8e..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strncmp.S
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Optimized strncmp implementation for POWER7/PowerPC64.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRNCMP
-# define STRNCMP strncmp
-#endif
-
-/* See strlen.S for comments on how the end-of-string testing works.  */
-
-/* int [r3] strncmp (const char *s1 [r3],
- const char *s2 [r4],
- size_t size [r5]) */
-
-EALIGN (STRNCMP,5,0)
- CALL_MCOUNT 3
-
-#define rTMP2 r0
-#define rRTN r3
-#define rSTR1 r3 /* first string arg */
-#define rSTR2 r4 /* second string arg */
-#define rN r5 /* max string length */
-#define rWORD1 r6 /* current word in s1 */
-#define rWORD2 r7 /* current word in s2 */
-#define rWORD3 r10
-#define rWORD4 r11
-#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
-#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */
-#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
-#define rBITDIF r11 /* bits that differ in s1 & s2 words */
-#define rTMP r12
-
- dcbt 0,rSTR1
- nop
- or rTMP,rSTR2,rSTR1
- lis r7F7F,0x7f7f
- dcbt 0,rSTR2
- nop
- clrldi. rTMP,rTMP,61
- cmpldi cr1,rN,0
- lis rFEFE,-0x101
- bne L(unaligned)
-/* We are doubleword aligned so set up for two loops.  First a doubleword
-   loop, then fall into the byte loop if any residual remains.  */
- srdi. rTMP,rN,3
- clrldi rN,rN,61
- addi rFEFE,rFEFE,-0x101
- addi r7F7F,r7F7F,0x7f7f
- cmpldi cr1,rN,0
- beq L(unaligned)
-
- mtctr rTMP
- ld rWORD1,0(rSTR1)
- ld rWORD2,0(rSTR2)
- sldi rTMP,rFEFE,32
- insrdi r7F7F,r7F7F,32,0
- add rFEFE,rFEFE,rTMP
- b L(g1)
-
-L(g0):
- ldu rWORD1,8(rSTR1)
- bne cr1,L(different)
- ldu rWORD2,8(rSTR2)
-L(g1): add rTMP,rFEFE,rWORD1
- nor rNEG,r7F7F,rWORD1
- bdz L(tail)
- and. rTMP,rTMP,rNEG
- cmpd cr1,rWORD1,rWORD2
- beq L(g0)
-
-/* OK. We've hit the end of the string. We need to be careful that
- we don't compare two strings as different because of gunk beyond
- the end of the strings... */
-
-#ifdef __LITTLE_ENDIAN__
-L(endstring):
- addi rTMP2, rTMP, -1
- beq cr1, L(equal)
- andc rTMP2, rTMP2, rTMP
- rldimi rTMP2, rTMP2, 1, 0
- and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
- and rWORD1, rWORD1, rTMP2
- cmpd cr1, rWORD1, rWORD2
- beq cr1, L(equal)
- cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */
- addi rNEG, rBITDIF, 1
- orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */
- sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */
- andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */
- andc rWORD2, rWORD2, rNEG
- xor. rBITDIF, rWORD1, rWORD2
- sub rRTN, rWORD1, rWORD2
- blt L(highbit)
- sradi rRTN, rRTN, 63 /* must return an int. */
- ori rRTN, rRTN, 1
- blr
-L(equal):
- li rRTN, 0
- blr
-
-L(different):
- ld rWORD1, -8(rSTR1)
- cmpb rBITDIF, rWORD1, rWORD2 /* 0xff on equal bytes. */
- addi rNEG, rBITDIF, 1
- orc rNEG, rNEG, rBITDIF /* 0's below LS differing byte. */
- sldi rNEG, rNEG, 8 /* 1's above LS differing byte. */
- andc rWORD1, rWORD1, rNEG /* mask off MS bytes. */
- andc rWORD2, rWORD2, rNEG
- xor. rBITDIF, rWORD1, rWORD2
- sub rRTN, rWORD1, rWORD2
- blt L(highbit)
- sradi rRTN, rRTN, 63
- ori rRTN, rRTN, 1
- blr
-L(highbit):
- sradi rRTN, rWORD2, 63
- ori rRTN, rRTN, 1
- blr
-
-#else
-L(endstring):
- and rTMP,r7F7F,rWORD1
- beq cr1,L(equal)
- add rTMP,rTMP,r7F7F
- xor. rBITDIF,rWORD1,rWORD2
- andc rNEG,rNEG,rTMP
- blt L(highbit)
- cntlzd rBITDIF,rBITDIF
- cntlzd rNEG,rNEG
- addi rNEG,rNEG,7
- cmpd cr1,rNEG,rBITDIF
- sub rRTN,rWORD1,rWORD2
- blt cr1,L(equal)
- sradi rRTN,rRTN,63 /* must return an int. */
- ori rRTN,rRTN,1
- blr
-L(equal):
- li rRTN,0
- blr
-
-L(different):
- ld rWORD1,-8(rSTR1)
- xor. rBITDIF,rWORD1,rWORD2
- sub rRTN,rWORD1,rWORD2
- blt L(highbit)
- sradi rRTN,rRTN,63
- ori rRTN,rRTN,1
- blr
-L(highbit):
- sradi rRTN,rWORD2,63
- ori rRTN,rRTN,1
- blr
-#endif
-
-/* Oh well. In this case, we just do a byte-by-byte comparison. */
- .align 4
-L(tail):
- and. rTMP,rTMP,rNEG
- cmpd cr1,rWORD1,rWORD2
- bne L(endstring)
- addi rSTR1,rSTR1,8
- bne cr1,L(different)
- addi rSTR2,rSTR2,8
- cmpldi cr1,rN,0
-L(unaligned):
- mtctr rN
- ble cr1,L(ux)
-L(uz):
- lbz rWORD1,0(rSTR1)
- lbz rWORD2,0(rSTR2)
- .align 4
-L(u1):
- cmpdi cr1,rWORD1,0
- bdz L(u4)
- cmpd rWORD1,rWORD2
- beq cr1,L(u4)
- bne L(u4)
- lbzu rWORD3,1(rSTR1)
- lbzu rWORD4,1(rSTR2)
- cmpdi cr1,rWORD3,0
- bdz L(u3)
- cmpd rWORD3,rWORD4
- beq cr1,L(u3)
- bne L(u3)
- lbzu rWORD1,1(rSTR1)
- lbzu rWORD2,1(rSTR2)
- cmpdi cr1,rWORD1,0
- bdz L(u4)
- cmpd rWORD1,rWORD2
- beq cr1,L(u4)
- bne L(u4)
- lbzu rWORD3,1(rSTR1)
- lbzu rWORD4,1(rSTR2)
- cmpdi cr1,rWORD3,0
- bdz L(u3)
- cmpd rWORD3,rWORD4
- beq cr1,L(u3)
- bne L(u3)
- lbzu rWORD1,1(rSTR1)
- lbzu rWORD2,1(rSTR2)
- b L(u1)
-
-L(u3): sub rRTN,rWORD3,rWORD4
- blr
-L(u4): sub rRTN,rWORD1,rWORD2
- blr
-L(ux):
- li rRTN,0
- blr
-END (STRNCMP)
-libc_hidden_builtin_def (strncmp)
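
The rFEFE/r7F7F constants above implement the classic carry trick for
end-of-string detection in the doubleword loop: adding
0xfefefefefefefeff (that is, -0x0101010101010101) carries out of every
nonzero byte, so ANDing with ~(w | 0x7f7f...) leaves a bit set only
where a byte of w was zero.  The same test in C (has_zero_byte is an
illustrative name):

#include <stdint.h>

static int
has_zero_byte (uint64_t w)
{
  uint64_t fefe = 0xfefefefefefefeffULL;    /* rFEFE */
  uint64_t sevenf = 0x7f7f7f7f7f7f7f7fULL;  /* r7F7F */
  /* add rTMP,rFEFE,rWORD1; nor rNEG,r7F7F,rWORD1; and. rTMP,rTMP,rNEG */
  return ((w + fefe) & ~(w | sevenf)) != 0;
}
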
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
deleted file mode 100644
index 0224f74898..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strncpy.S
+++ /dev/null
@@ -1,722 +0,0 @@
-/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* Implements the functions
-
- char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
-
- AND
-
- char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
-
-   The algorithm is as follows:
-   > if src and dest are 8-byte aligned, perform doubleword copies
-     else
-   > copy byte by byte on unaligned addresses.
-
-   The aligned comparisons are made using cmpb instructions. */
-
-/* The optimizations for performance are as follows:
-   1. Data alignment [gain from aligned memory access on read/write].
-   2. Loop unrolling/unwinding, from which POWER7 gains
-      [gain by reduction of branch penalty].
-   3. The final padding with null bytes is done by calling an optimized
-      memset. */
-
-#ifdef USE_AS_STPNCPY
-# ifndef STPNCPY
-# define FUNC_NAME __stpncpy
-# else
-# define FUNC_NAME STPNCPY
-# endif
-#else
-# ifndef STRNCPY
-# define FUNC_NAME strncpy
-# else
-# define FUNC_NAME STRNCPY
-# endif
-#endif /* !USE_AS_STPNCPY */
-
-#define FRAMESIZE (FRAME_MIN_SIZE+32)
-
-#ifndef MEMSET
-/* For builds with no IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define MEMSET __GI_memset
-# else
-# define MEMSET memset
-# endif
-#endif
-
- .machine power7
-EALIGN(FUNC_NAME, 4, 0)
- CALL_MCOUNT 3
-
- mflr r0 /* load link register LR to r0 */
-	or r10, r3, r4		/* merge src and dst to test both alignments  */
-	rldicl. r8, r10, 0, 61	/* are both doubleword aligned?  */
-
-	std r19, -8(r1)		/* save caller's register r19  */
-	std r18, -16(r1)	/* save caller's register r18  */
- std r0, 16(r1) /* store the link register */
- stdu r1, -FRAMESIZE(r1) /* create the stack frame */
-
- mr r9, r3 /* save r3 into r9 for use */
- mr r18, r3 /* save r3 for retCode of strncpy */
- bne 0, L(unaligned)
-
-L(aligned):
-	srdi r11, r5, 3		/* compute count for CTR; count = n/8  */
-	cmpldi cr7, r11, 3	/* unroll 4 times only if count > 3  */
-	ble cr7, L(update1)
-
-	ld r10, 0(r4)		/* load doubleword from src  */
-	cmpb r8, r10, r8	/* check the dw we just read for nulls (r8 is 0)  */
-	cmpdi cr7, r8, 0	/* if cmpb found no null, continue  */
- bne cr7, L(update3)
-
- std r10, 0(r3) /* copy doubleword at offset=0 */
-	ld r10, 8(r4)		/* load next doubleword from offset=8  */
-	cmpb r8, r10, r8	/* check the dw we just read for nulls  */
-	cmpdi cr7, r8, 0	/* if cmpb found no null, continue  */
-	bne cr7, L(HopBy8)
-
- addi r8, r11, -4
- mr r7, r3
- srdi r8, r8, 2
- mr r6, r4
- addi r8, r8, 1
- li r12, 0
- mtctr r8
- b L(dwordCopy)
-
- .p2align 4
-L(dWordUnroll):
- std r8, 16(r9)
-	ld r8, 24(r4)		/* load dword, perform loop unrolling again  */
- cmpb r10, r8, r10
- cmpdi cr7, r10, 0
- bne cr7, L(HopBy24)
-
- std r8, 24(r7) /* copy dword at offset=24 */
- addi r9, r9, 32
- addi r4, r4, 32
- bdz L(leftDwords) /* continue with loop on counter */
-
- ld r3, 32(r6)
- cmpb r8, r3, r10
- cmpdi cr7, r8, 0
- bne cr7, L(update2)
-
- std r3, 32(r7)
- ld r10, 40(r6)
- cmpb r8, r10, r8
- cmpdi cr7, r8, 0
- bne cr7, L(HopBy40)
-
- mr r6, r4 /* update values */
- mr r7, r9
- mr r11, r0
- mr r5, r19
-
-L(dwordCopy):
- std r10, 8(r9) /* copy dword at offset=8 */
- addi r19, r5, -32
- addi r0, r11, -4
- ld r8, 16(r4)
- cmpb r10, r8, r12
- cmpdi cr7, r10, 0
- beq cr7, L(dWordUnroll)
-
- addi r9, r9, 16 /* increment dst by 16 */
- addi r4, r4, 16 /* increment src by 16 */
- addi r5, r5, -16 /* decrement length 'n' by 16 */
- addi r0, r11, -2 /* decrement loop counter */
-
-L(dWordUnrollOFF):
- ld r10, 0(r4) /* load first dword */
- li r8, 0 /* load mask */
- cmpb r8, r10, r8
- cmpdi cr7, r8, 0
- bne cr7, L(byte_by_byte)
- mtctr r0
- li r7, 0
- b L(CopyDword)
-
- .p2align 4
-L(loadDWordandCompare):
- ld r10, 0(r4)
- cmpb r8, r10, r7
- cmpdi cr7, r8, 0
- bne cr7, L(byte_by_byte)
-
-L(CopyDword):
- addi r9, r9, 8
- std r10, -8(r9)
- addi r4, r4, 8
- addi r5, r5, -8
- bdnz L(loadDWordandCompare)
-
-L(byte_by_byte):
- cmpldi cr7, r5, 3
- ble cr7, L(verifyByte)
- srdi r10, r5, 2
- mr r19, r9
- mtctr r10
- b L(firstByteUnroll)
-
- .p2align 4
-L(bytes_unroll):
- lbz r10, 1(r4) /* load byte from src */
- cmpdi cr7, r10, 0 /* compare for NULL */
- stb r10, 1(r19) /* store byte to dst */
- beq cr7, L(updtDestComputeN2ndByte)
-
- addi r4, r4, 4 /* advance src */
-
- lbz r10, -2(r4) /* perform loop unrolling for byte r/w */
- cmpdi cr7, r10, 0
- stb r10, 2(r19)
- beq cr7, L(updtDestComputeN3rdByte)
-
- lbz r10, -1(r4) /* perform loop unrolling for byte r/w */
- addi r19, r19, 4
- cmpdi cr7, r10, 0
- stb r10, -1(r19)
- beq cr7, L(ComputeNByte)
-
- bdz L(update0)
-
-L(firstByteUnroll):
- lbz r10, 0(r4) /* perform loop unrolling for byte r/w */
-	cmpdi cr7, r10, 0
- stb r10, 0(r19)
- bne cr7, L(bytes_unroll)
- addi r19, r19, 1
-
-L(ComputeNByte):
-	subf r9, r19, r9	/* compute number of bytes to fill  */
- add r8, r9, r5
-
-L(zeroFill):
- cmpdi cr7, r8, 0 /* compare if length is zero */
- beq cr7, L(update3return)
-
- mr r3, r19 /* fill buffer with */
- li r4, 0 /* zero fill buffer */
- mr r5, r8 /* how many bytes to fill buffer with */
- bl MEMSET /* call optimized memset */
- nop
-
-L(update3return):
-#ifdef USE_AS_STPNCPY
- addi r3, r19, -1 /* update return value */
-#endif
-
-L(hop2return):
-#ifndef USE_AS_STPNCPY
- mr r3, r18 /* set return value */
-#endif
- addi r1, r1, FRAMESIZE /* restore stack pointer */
- ld r0, 16(r1) /* read the saved link register */
-	ld r18, -16(r1)		/* restore caller's save register r18  */
-	ld r19, -8(r1)		/* restore caller's save register r19  */
-	mtlr r0			/* restore the link register  */
- blr /* return */
-
- .p2align 4
-L(update0):
- mr r9, r19
-
- .p2align 4
-L(verifyByte):
- rldicl. r8, r5, 0, 62
-#ifdef USE_AS_STPNCPY
- mr r3, r9
-#endif
- beq cr0, L(hop2return)
- mtctr r8
- addi r4, r4, -1
- mr r19, r9
- b L(oneBYone)
-
- .p2align 4
-L(proceed):
- bdz L(done)
-
-L(oneBYone):
- lbzu r10, 1(r4) /* copy byte */
- addi r19, r19, 1
- addi r8, r8, -1
- cmpdi cr7, r10, 0
- stb r10, -1(r19)
- bne cr7, L(proceed)
- b L(zeroFill)
-
- .p2align 4
-L(done):
- addi r1, r1, FRAMESIZE /* restore stack pointer */
-#ifdef USE_AS_STPNCPY
- mr r3, r19 /* set the return value */
-#else
- mr r3, r18 /* set the return value */
-#endif
- ld r0, 16(r1) /* read the saved link register */
-	ld r18, -16(r1)		/* restore caller's save register r18  */
-	ld r19, -8(r1)		/* restore caller's save register r19  */
-	mtlr r0			/* restore the link register  */
- blr /* return */
-
-L(update1):
- mr r0, r11
- mr r19, r5
-
- .p2align 4
-L(leftDwords):
- cmpdi cr7, r0, 0
- mr r5, r19
- bne cr7, L(dWordUnrollOFF)
- b L(byte_by_byte)
-
- .p2align 4
-L(updtDestComputeN2ndByte):
- addi r19, r19, 2 /* update dst by 2 */
- subf r9, r19, r9 /* compute distance covered */
- add r8, r9, r5
- b L(zeroFill)
-
- .p2align 4
-L(updtDestComputeN3rdByte):
- addi r19, r19, 3 /* update dst by 3 */
- subf r9, r19, r9 /* compute distance covered */
- add r8, r9, r5
- b L(zeroFill)
-
- .p2align 4
-L(HopBy24):
- addi r9, r9, 24 /* increment dst by 24 */
- addi r4, r4, 24 /* increment src by 24 */
- addi r5, r5, -24 /* decrement length 'n' by 24 */
- addi r0, r11, -3 /* decrement loop counter */
- b L(dWordUnrollOFF)
-
- .p2align 4
-L(update2):
- mr r5, r19
- b L(dWordUnrollOFF)
-
- .p2align 4
-L(HopBy40):
- addi r9, r7, 40 /* increment dst by 40 */
- addi r4, r6, 40 /* increment src by 40 */
- addi r5, r5, -40 /* decrement length 'n' by 40 */
- addi r0, r11, -5 /* decrement loop counter */
- b L(dWordUnrollOFF)
-
-L(update3):
- mr r0, r11
- b L(dWordUnrollOFF)
-
-L(HopBy8):
- addi r9, r3, 8 /* increment dst by 8 */
- addi r4, r4, 8 /* increment src by 8 */
- addi r5, r5, -8 /* decrement length 'n' by 8 */
- addi r0, r11, -1 /* decrement loop counter */
- b L(dWordUnrollOFF)
-
-L(unaligned):
- cmpdi r5, 16 /* Proceed byte by byte for less than 16 */
- ble L(byte_by_byte)
- rldicl r7, r3, 0, 61
- rldicl r6, r4, 0, 61
- cmpdi r6, 0 /* Check src alignment */
- beq L(srcaligndstunalign)
- /* src is unaligned */
- rlwinm r10, r4, 3,26,28 /* Calculate padding. */
- clrrdi r4, r4, 3 /* Align the addr to dw boundary */
- ld r8, 0(r4) /* Load doubleword from memory. */
- li r0, 0
- /* Discard bits not part of the string */
-#ifdef __LITTLE_ENDIAN__
- srd r7, r8, r10
-#else
- sld r7, r8, r10
-#endif
- cmpb r0, r7, r0 /* Compare each byte against null */
- /* Discard bits not part of the string */
-#ifdef __LITTLE_ENDIAN__
- sld r0, r0, r10
-#else
- srd r0, r0, r10
-#endif
- cmpdi r0, 0
- bne L(bytebybyte) /* if it has null, copy byte by byte */
- subfic r6, r6, 8
- rlwinm r12, r3, 3,26,28 /* Calculate padding in bits. */
- rldicl r9, r3, 0, 61 /* Calculate padding in bytes. */
- addi r3, r3, -1
-
- cmpdi r12, 0 /* check dest alignment */
- beq L(srcunaligndstalign)
-
- /* both src and dst unaligned */
-#ifdef __LITTLE_ENDIAN__
- sld r8, r7, r10
- mr r11, r10
- addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
-#else
- srd r8, r7, r10
- subfic r11, r10, 64
-#endif
-	/* dst alignment is greater than src alignment?  */
- cmpd cr7, r12, r10
- ble cr7, L(dst_align_small)
- /* src alignment is less than dst */
-
- /* Calculate the dst alignment difference */
- subfic r7, r9, 8
- mtctr r7
-
- /* Write until dst is aligned */
- cmpdi r0, r7, 4
- blt L(storebyte1) /* less than 4, store byte by byte */
- beq L(equal1) /* if its 4, store word */
- addi r0, r7, -4 /* greater than 4, so stb and stw */
- mtctr r0
-L(storebyte1):
-#ifdef __LITTLE_ENDIAN__
- addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
-#else
- addi r11, r11, -8
-#endif
- srd r7, r8, r11
- stbu r7, 1(r3)
- addi r5, r5, -1
- bdnz L(storebyte1)
-
- subfic r7, r9, 8 /* Check the remaining bytes */
- cmpdi r0, r7, 4
- blt L(proceed1)
-
- .align 4
-L(equal1):
-#ifdef __LITTLE_ENDIAN__
- addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
- srd r7, r8, r11
-#else
- subfic r11, r11, 64
- sld r7, r8, r11
- srdi r7, r7, 32
-#endif
- stw r7, 1(r3)
- addi r3, r3, 4
- addi r5, r5, -4
-
-L(proceed1):
- mr r7, r8
- /* calculate the Left over bytes to be written */
- subfic r11, r10, 64
- subfic r12, r12, 64
- subf r12, r12, r11 /* remaining bytes on second dw */
- subfic r10, r12, 64 /* remaining bytes on first dw */
- subfic r9, r9, 8
- subf r6, r9, r6 /* recalculate padding */
-L(srcunaligndstalign):
- addi r3, r3, 1
- subfic r12, r10, 64 /* remaining bytes on second dw */
- addi r4, r4, 8
- li r0,0
- b L(storedouble)
-
- .align 4
-L(dst_align_small):
- mtctr r6
- /* Write until src is aligned */
-L(storebyte2):
-#ifdef __LITTLE_ENDIAN__
- addi r11, r11, 8 /* Adjust byte pointer on dw */
-#else
- addi r11, r11, -8
-#endif
- srd r7, r8, r11
- stbu r7, 1(r3)
- addi r5, r5, -1
- bdnz L(storebyte2)
-
- addi r4, r4, 8 /* Increment src pointer */
- addi r3, r3, 1 /* Increment dst pointer */
- mr r9, r3
- li r8, 0
- cmpd cr7, r12, r10
- beq cr7, L(aligned)
- rldicl r6, r3, 0, 61 /* Recalculate padding */
- mr r7, r6
-
-	/* src is aligned */
-L(srcaligndstunalign):
- mr r9, r3
- mr r6, r7
- ld r8, 0(r4)
- subfic r10, r7, 8
- mr r7, r8
- li r0, 0 /* Check null */
- cmpb r0, r8, r0
- cmpdi r0, 0
- bne L(byte_by_byte) /* Do byte by byte if there is NULL */
- rlwinm r12, r3, 3,26,28 /* Calculate padding */
- addi r3, r3, -1
- /* write byte by byte until aligned */
-#ifdef __LITTLE_ENDIAN__
- li r11, -8
-#else
- li r11, 64
-#endif
- mtctr r10
- cmpdi r0, r10, 4
- blt L(storebyte)
- beq L(equal)
- addi r0, r10, -4
- mtctr r0
-L(storebyte):
-#ifdef __LITTLE_ENDIAN__
- addi r11, r11, 8 /* Adjust byte pointer on dw */
-#else
- addi r11, r11, -8
-#endif
- srd r7, r8, r11
- stbu r7, 1(r3)
- addi r5, r5, -1
- bdnz L(storebyte)
-
- cmpdi r0, r10, 4
- blt L(align)
-
- .align 4
-L(equal):
-#ifdef __LITTLE_ENDIAN__
- addi r11, r11, 8
- srd r7, r8, r11
-#else
- subfic r11, r11, 64
- sld r7, r8, r11
- srdi r7, r7, 32
-#endif
- stw r7, 1(r3)
- addi r5, r5, -4
- addi r3, r3, 4
-L(align):
- addi r3, r3, 1
- addi r4, r4, 8 /* Increment src pointer */
- subfic r10, r12, 64
- li r0, 0
- /* dst addr aligned to 8 */
-L(storedouble):
- cmpdi r5, 8
- ble L(null1)
- ld r7, 0(r4) /* load next dw */
- cmpb r0, r7, r0
- cmpdi r0, 0 /* check for null on each new dw */
- bne L(null)
-#ifdef __LITTLE_ENDIAN__
- srd r9, r8, r10 /* bytes from first dw */
- sld r11, r7, r12 /* bytes from second dw */
-#else
- sld r9, r8, r10
- srd r11, r7, r12
-#endif
- or r11, r9, r11 /* make as a single dw */
- std r11, 0(r3) /* store as std on aligned addr */
-	mr r8, r7		/* still a few bytes left to be written  */
- addi r3, r3, 8 /* increment dst addr */
- addi r4, r4, 8 /* increment src addr */
- addi r5, r5, -8
- b L(storedouble) /* Loop until NULL */
-
- .align 4
-
-/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(null):
- addi r3, r3, -1
- mr r10, r12
- mtctr r6
-#ifdef __LITTLE_ENDIAN__
- subfic r10, r10, 64
- addi r10, r10, -8
-#endif
- cmpdi r0, r5, 4
- blt L(loop)
- cmpdi r0, r6, 4
- blt L(loop)
-
- /* we can still use stw if leftover >= 4 */
-#ifdef __LITTLE_ENDIAN__
- addi r10, r10, 8
- srd r11, r8, r10
-#else
- subfic r10, r10, 64
- sld r11, r8, r10
- srdi r11, r11, 32
-#endif
- stw r11, 1(r3)
- addi r5, r5, -4
- addi r3, r3, 4
- cmpdi r0, r5, 0
- beq L(g1)
- cmpdi r0, r6, 4
- beq L(bytebybyte1)
- addi r10, r10, 32
-#ifdef __LITTLE_ENDIAN__
- addi r10, r10, -8
-#else
- subfic r10, r10, 64
-#endif
- addi r0, r6, -4
- mtctr r0
- /* remaining byte by byte part of first dw */
-L(loop):
-#ifdef __LITTLE_ENDIAN__
- addi r10, r10, 8
-#else
- addi r10, r10, -8
-#endif
- srd r0, r8, r10
- stbu r0, 1(r3)
- addi r5, r5, -1
- cmpdi r0, r5, 0
- beq L(g1)
- bdnz L(loop)
-L(bytebybyte1):
- addi r3, r3, 1
- /* remaining byte by byte part of second dw */
-L(bytebybyte):
- addi r3, r3, -8
- addi r4, r4, -1
-
-#ifdef __LITTLE_ENDIAN__
- extrdi. r0, r7, 8, 56
- stbu r7, 8(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 48
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 40
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 32
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 24
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 16
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 8
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi r0, r7, 8, 0
- stbu r0, 1(r3)
- addi r5, r5, -1
- b L(g2)
-#else
- extrdi. r0, r7, 8, 0
- stbu r0, 8(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 8
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 16
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 24
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 32
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 40
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- extrdi. r0, r7, 8, 48
- stbu r0, 1(r3)
- addi r5, r5, -1
- beq L(g2)
- cmpdi r5, 0
- beq L(g1)
- stbu r7, 1(r3)
- addi r5, r5, -1
- b L(g2)
-#endif
-L(g1):
-#ifdef USE_AS_STPNCPY
- addi r3, r3, 1
-#endif
-L(g2):
- addi r3, r3, 1
- mr r19, r3
- mr r8, r5
- b L(zeroFill)
-L(null1):
- mr r9, r3
- subf r4, r6, r4
- b L(byte_by_byte)
-END(FUNC_NAME)
-#ifndef USE_AS_STPNCPY
-libc_hidden_builtin_def (strncpy)
-#endif
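
Putting the pieces of the deleted strncpy together: copy doublewords
while both pointers are aligned and the loaded word holds no NUL, finish
with a byte tail, then make one memset call for the null padding.  A
much-simplified C model under those assumptions (my_strncpy and
has_zero_byte are illustrative; the real code also handles misaligned
inputs by shifting doubleword pairs):

#include <string.h>
#include <stdint.h>

static int
has_zero_byte (uint64_t w)
{
  return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
}

static char *
my_strncpy (char *dst, const char *src, size_t n)
{
  char *d = dst;
  /* Fast path: aligned doubleword copies until a NUL shows up in the
     loaded word (cmpb against zero in the assembly).  */
  while (n >= 8
	 && ((((uintptr_t) d | (uintptr_t) src) & 7) == 0)
	 && !has_zero_byte (*(const uint64_t *) src))
    {
      *(uint64_t *) d = *(const uint64_t *) src;
      d += 8;
      src += 8;
      n -= 8;
    }
  /* Byte tail: copy up to and including the NUL.  */
  while (n != 0 && (*d = *src) != '\0')
    d++, src++, n--;
  /* Pad the remainder with zeros, like the "bl MEMSET" above.  */
  if (n != 0)
    memset (d, 0, n);
  return dst;
}
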
diff --git a/sysdeps/powerpc/powerpc64/power7/strnlen.S b/sysdeps/powerpc/powerpc64/power7/strnlen.S
deleted file mode 100644
index a970b6ce30..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strnlen.S
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2010-2017 Free Software Foundation, Inc.
- Contributed by Luis Machado <luisgpm@br.ibm.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRNLEN
-# define STRNLEN __strnlen
-#endif
-
-/* size_t [r3] strnlen (const char *s [r3], size_t size [r4])  */
- .machine power7
-ENTRY (STRNLEN)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3
- add r7,r3,r4 /* Calculate the last acceptable address. */
- cmpldi r4,32
- li r0,0 /* Doubleword with null chars. */
- addi r7,r7,-1
-
-	/* If we have less than 33 bytes to search, skip to a faster code path.  */
- ble L(small_range)
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- ld r12,0(r8) /* Load doubleword from memory. */
- cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- sld r10,r10,r6
-#else
- sld r10,r10,r6
- srd r10,r10,r6
-#endif
-	cmpldi	cr7,r10,0	/* If r10 == 0, no nulls have been found.  */
- bne cr7,L(done)
-
- clrrdi r7,r7,3 /* Address of last doubleword. */
- mtcrf 0x01,r8
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop_setup)
-
- /* Handle DWORD2 of pair. */
- ldu r12,8(r8)
- cmpb r10,r12,r0
- cmpldi cr7,r10,0
- bne cr7,L(done)
-
-L(loop_setup):
- /* The last dword we want to read in the loop below is the one
- containing the last byte of the string, ie. the dword at
-	   containing the last byte of the string, i.e. the dword at
- r8 + 8, we read 2 * cnt dwords, so the last dword read will
- be at r8 + 8 + 16 * cnt - 8. Solving for cnt gives
- cnt = (r7 - r8) / 16 */
- sub r5,r7,r8
- srdi r6,r5,4 /* Number of loop iterations. */
- mtctr r6 /* Setup the counter. */
-
- /* Main loop to look for the null byte in the string. Since
-	   it's a small loop (< 8 instructions), align it to 32 bytes.  */
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
-
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r10,r12,r0
- cmpb r9,r11,r0
- or r5,r9,r10 /* Merge everything in one doubleword. */
- cmpldi cr7,r5,0
- bne cr7,L(found)
- bdnz L(loop)
-
- /* We may have one more dword to read. */
- cmpld cr6,r8,r7
- beq cr6,L(end_max)
-
- ldu r12,8(r8)
- cmpb r10,r12,r0
- cmpldi cr6,r10,0
- bne cr6,L(done)
-
-L(end_max):
- mr r3,r4
- blr
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
- .align 4
-L(found):
- cmpldi cr6,r10,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- length. */
-
- mr r10,r9
- addi r8,r8,8
-
- /* r10 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the length.
- We need to make sure the null char is *before* the end of the
- range. */
-L(done):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r10,-1
- andc r0,r0,r10
- popcntd r0,r0
-#else
- cntlzd r0,r10 /* Count leading zeros before the match. */
-#endif
- sub r3,r8,r3
- srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
- add r3,r3,r0 /* Length until the match. */
- cmpld r3,r4
- blelr
- mr r3,r4
- blr
-
-/* Deals with size <= 32. */
- .align 4
-L(small_range):
- cmpldi r4,0
- beq L(end_max)
-
- clrrdi r7,r7,3 /* Address of last doubleword. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
- ld r12,0(r8) /* Load doubleword from memory. */
- cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- sld r10,r10,r6
-#else
- sld r10,r10,r6
- srd r10,r10,r6
-#endif
- cmpldi cr7,r10,0
- bne cr7,L(done)
-
- cmpld r8,r7
- beq L(end_max)
-
- .p2align 5
-L(loop_small):
- ldu r12,8(r8)
- cmpb r10,r12,r0
- cmpldi cr6,r10,0
- bne cr6,L(done)
- cmpld r8,r7
- bne L(loop_small)
- mr r3,r4
- blr
-
-END (STRNLEN)
-libc_hidden_def (__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
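
The L(loop_setup) comment above derives the iteration count for the
bounded scan; the same arithmetic in C, reusing the assembly's register
names as variables (a sketch of the math only, for size > 32):

#include <stdint.h>

static uint64_t
strnlen_loop_count (uintptr_t s, uint64_t size)
{
  uintptr_t r8 = s & ~(uintptr_t) 7;		  /* clrrdi r8,r3,3 */
  uintptr_t r7 = (s + size - 1) & ~(uintptr_t) 7; /* add, addi, clrrdi */
  /* The loop reads pairs at r8+8 and r8+16, so the last dword read is
     r8 + 16*cnt, giving cnt = (r7 - r8) / 16.  */
  return (uint64_t) (r7 - r8) >> 4;		  /* sub; srdi r6,r5,4 */
}
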
diff --git a/sysdeps/powerpc/powerpc64/power7/strrchr.S b/sysdeps/powerpc/powerpc64/power7/strrchr.S
deleted file mode 100644
index c22393deb5..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strrchr.S
+++ /dev/null
@@ -1,260 +0,0 @@
-/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
- Copyright (C) 2014-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* char * [r3] strrchr (const char *s [r3], int c [r4])  */
-
-#ifndef STRRCHR
-# define STRRCHR strrchr
-#endif
-
- .machine power7
-ENTRY (STRRCHR)
- CALL_MCOUNT 2
- dcbt 0,r3
- clrrdi r8,r3,3 /* Align the address to doubleword boundary. */
- cmpdi cr7,r4,0
- ld r12,0(r8) /* Load doubleword from memory. */
-	li	r9,0	   /* used to store the last occurrence  */
- li r0,0 /* Doubleword with null chars to use
- with cmpb. */
-
- rlwinm r6,r3,3,26,28 /* Calculate padding. */
-
- beq cr7,L(null_match)
-
- /* Replicate byte to doubleword. */
- insrdi r4,r4,8,48
- insrdi r4,r4,16,32
- insrdi r4,r4,32,0
-
-	/* r4 has been replicated; if the replicated byte is zero
-	   (c may have had only high bits set), check for null again.  */
- cmpdi cr7,r4,0
- beq cr7,L(null_match)
- /* Now r4 has a doubleword of c bytes and r0 has
- a doubleword of null bytes. */
-
- cmpb r10,r12,r4 /* Compare each byte against c byte. */
- cmpb r11,r12,r0 /* Compare each byte against null byte. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r10,r10,r6
- srd r11,r11,r6
- sld r10,r10,r6
- sld r11,r11,r6
-#else
- sld r10,r10,r6
- sld r11,r11,r6
- srd r10,r10,r6
- srd r11,r11,r6
-#endif
- or r5,r10,r11 /* OR the results to speed things up. */
- cmpdi cr7,r5,0 /* If r5 == 0, no c or null bytes
- have been found. */
- bne cr7,L(done)
-
-L(align):
- mtcrf 0x01,r8
-
-	/* Are we now aligned to a quadword boundary?  If so, skip to
-	   the main loop.  Otherwise, go through the alignment code.  */
-
- bt 28,L(loop)
-
-	/* Handle DWORD2 of pair.  */
- ldu r12,8(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- or r5,r10,r11
- cmpdi cr7,r5,0
- bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
- .p2align 5
-L(loop):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r7,16(r8)
- cmpb r10,r12,r4
- cmpb r11,r12,r0
- cmpb r6,r7,r4
- cmpb r7,r7,r0
- or r12,r10,r11
- or r5,r6,r7
- or r5,r12,r5
- cmpdi cr7,r5,0
- beq cr7,L(loop)
-
- /* OK, one (or both) of the doublewords contains a c/null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a c/null byte. */
- cmpdi cr6,r12,0
- addi r8,r8,-8
- bne cr6,L(done)
-
- /* The c/null byte must be in the second doubleword. Adjust the
- address again and move the result of cmpb to r10 so we can calculate
- the pointer. */
-
- mr r10,r6
- mr r11,r7
- addi r8,r8,8
-
- /* r10/r11 have the output of the cmpb instructions, that is,
- 0xff in the same position as the c/null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-
-L(done):
-	/* If r11 is nonzero (a null was found), locate the first null
-	   and clear every c match in r10 that falls after it.  */
- cmpdi cr7,r11,0
- beq cr7,L(no_null)
-#ifdef __LITTLE_ENDIAN__
- addi r3,r11,-1
- andc r3,r3,r11
- popcntd r0,r3
-#else
- cntlzd r0,r11
-#endif
- subfic r0,r0,63
- li r6,-1
-#ifdef __LITTLE_ENDIAN__
- srd r0,r6,r0
-#else
- sld r0,r6,r0
-#endif
- and r10,r0,r10
-L(no_null):
-#ifdef __LITTLE_ENDIAN__
- cntlzd r0,r10 /* Count leading zeros before c matches. */
- addi r3,r10,-1
- andc r3,r3,r10
- addi r10,r11,-1
- andc r10,r10,r11
- cmpld cr7,r3,r10
- bgt cr7,L(no_match)
-#else
- addi r3,r10,-1 /* Count trailing zeros before c matches. */
- andc r3,r3,r10
- popcntd r0,r3
- cmpld cr7,r11,r10
- bgt cr7,L(no_match)
-#endif
- srdi r0,r0,3 /* Convert trailing zeros to bytes. */
- subfic r0,r0,7
- add r9,r8,r0 /* Return address of the matching c byte
- or null in case c was not found. */
- li r0,0
-	cmpdi	cr7,r11,0	/* If r11 == 0, no nulls have been found.  */
- beq cr7,L(align)
-
- .align 4
-L(no_match):
- mr r3,r9
- blr
-
-/* We are here because strrchr was called with a null byte. */
- .align 4
-L(null_match):
- /* r0 has a doubleword of null bytes. */
-
- cmpb r5,r12,r0 /* Compare each byte against null bytes. */
-
- /* Move the doublewords left and right to discard the bits that are
- not part of the string and bring them back as zeros. */
-#ifdef __LITTLE_ENDIAN__
- srd r5,r5,r6
- sld r5,r5,r6
-#else
- sld r5,r5,r6
- srd r5,r5,r6
-#endif
-	cmpdi	cr7,r5,0	/* If r5 == 0, no null bytes
-				   have been found.  */
- bne cr7,L(done_null)
-
- mtcrf 0x01,r8
-
- /* Are we now aligned to a quadword boundary? If so, skip to
- the main loop. Otherwise, go through the alignment code. */
-
- bt 28,L(loop_null)
-
-	/* Handle DWORD2 of pair.  */
- ldu r12,8(r8)
- cmpb r5,r12,r0
- cmpdi cr7,r5,0
- bne cr7,L(done_null)
- b L(loop_null) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
-
- /* Main loop to look for the end of the string. Since it's a
-	   small loop (< 8 instructions), align it to 32 bytes.  */
- .p2align 5
-L(loop_null):
- /* Load two doublewords, compare and merge in a
- single register for speed. This is an attempt
- to speed up the null-checking process for bigger strings. */
- ld r12,8(r8)
- ldu r11,16(r8)
- cmpb r5,r12,r0
- cmpb r10,r11,r0
- or r6,r5,r10
- cmpdi cr7,r6,0
- beq cr7,L(loop_null)
-
- /* OK, one (or both) of the doublewords contains a null byte. Check
- the first doubleword and decrement the address in case the first
- doubleword really contains a null byte. */
-
- cmpdi cr6,r5,0
- addi r8,r8,-8
- bne cr6,L(done_null)
-
- /* The null byte must be in the second doubleword. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- pointer. */
-
- mr r5,r10
- addi r8,r8,8
-
- /* r5 has the output of the cmpb instruction, that is, it contains
- 0xff in the same position as the null byte in the original
- doubleword from the string. Use that to calculate the pointer. */
-L(done_null):
-#ifdef __LITTLE_ENDIAN__
- addi r0,r5,-1
- andc r0,r0,r5
- popcntd r0,r0
-#else
- cntlzd r0,r5 /* Count leading zeros before the match. */
-#endif
-	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
- add r3,r8,r0 /* Return address of the matching null byte. */
- blr
-END (STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
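
strrchr must ignore c matches that sit beyond the terminating NUL, which
is what the masking at L(done) above does.  A C model of the
little-endian path (drop_matches_after_nul_le is a hypothetical name;
cmask and nulmask are the cmpb results for c and for NUL):

#include <stdint.h>

static uint64_t
drop_matches_after_nul_le (uint64_t cmask, uint64_t nulmask)
{
  if (nulmask == 0)
    return cmask;			/* no NUL in this word */
  /* Bit index of the first NUL lane: popcntd of ((m - 1) & ~m).  */
  unsigned int tz =
    (unsigned int) __builtin_popcountll ((nulmask - 1) & ~nulmask);
  /* subfic r0,r0,63; srd r0,r6,r0: ones up through the NUL lane.  */
  uint64_t keep = tz >= 63 ? ~0ULL : (1ULL << (tz + 1)) - 1;
  return cmask & keep;
}
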
diff --git a/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c b/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c
deleted file mode 100644
index a917b2157e..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Optimized strstr implementation for PowerPC64/POWER7.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <string.h>
-
-#define STRSTR __strstr_ppc
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(__name)
-
-extern __typeof (strstr) __strstr_ppc attribute_hidden;
-
-#include <string/strstr.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/strstr.S b/sysdeps/powerpc/powerpc64/power7/strstr.S
deleted file mode 100644
index 260db2ed6d..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strstr.S
+++ /dev/null
@@ -1,521 +0,0 @@
-/* Optimized strstr implementation for PowerPC64/POWER7.
- Copyright (C) 2015-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-/* char * [r3] strstr (char *s [r3], char *pat [r4])  */
-
-/* The performance gain comes from aligned memory accesses, doubleword
-   loads, and the cmpb instruction for quicker comparisons.  */
-
-#define ITERATIONS 64
-
-#ifndef STRSTR
-# define STRSTR strstr
-#endif
-
-#ifndef STRLEN
-/* For builds with no IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define STRLEN __GI_strlen
-# else
-# define STRLEN strlen
-# endif
-#endif
-
-#ifndef STRNLEN
-/* For builds with no IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define STRNLEN __GI_strnlen
-# else
-# define STRNLEN __strnlen
-# endif
-#endif
-
-#ifndef STRCHR
-# ifdef SHARED
-# define STRCHR __GI_strchr
-# else
-# define STRCHR strchr
-# endif
-#endif
-
-#define FRAMESIZE (FRAME_MIN_SIZE+32)
- .machine power7
-EALIGN (STRSTR, 4, 0)
- CALL_MCOUNT 2
- mflr r0 /* Load link register LR to r0. */
-	std	r31, -8(r1)	/* Save caller's register r31.  */
-	std	r30, -16(r1)	/* Save caller's register r30.  */
-	std	r29, -24(r1)	/* Save caller's register r29.  */
-	std	r28, -32(r1)	/* Save caller's register r28.  */
- std r0, 16(r1) /* Store the link register. */
- cfi_offset(r31, -8)
- cfi_offset(r30, -16)
- cfi_offset(r28, -32)
- cfi_offset(r29, -24)
- cfi_offset(lr, 16)
- stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
- cfi_adjust_cfa_offset(FRAMESIZE)
-
- dcbt 0, r3
- dcbt 0, r4
- cmpdi cr7, r3, 0
- beq cr7, L(retnull)
- cmpdi cr7, r4, 0
- beq cr7, L(retnull)
-
- mr r29, r3
- mr r30, r4
- mr r3, r4
- bl STRLEN
- nop
-
-	cmpdi	cr7, r3, 0	/* If search str is empty.  */
- beq cr7, L(ret_r3)
-
- mr r31, r3
- mr r4, r3
- mr r3, r29
- bl STRNLEN
- nop
-
- cmpd cr7, r3, r31 /* If len(r3) < len(r4). */
- blt cr7, L(retnull)
- mr r3, r29
- lbz r4, 0(r30)
- bl STRCHR
- nop
-
- mr r11, r3
- /* If first char of search str is not present. */
- cmpdi cr7, r3, 0
- ble cr7, L(end)
- /* Reg r28 is used to count the number of iterations. */
- li r28, 0
- rldicl r8, r3, 0, 52 /* Page cross check. */
- cmpldi cr7, r8, 4096-16
- bgt cr7, L(bytebybyte)
-
- rldicl r8, r30, 0, 52
- cmpldi cr7, r8, 4096-16
- bgt cr7, L(bytebybyte)
-
- /* If len(r4) < 8 handle in a different way. */
- /* Shift position based on null and use cmpb. */
- cmpdi cr7, r31, 8
- blt cr7, L(lessthan8)
-
- /* Len(r4) >= 8 reaches here. */
- mr r8, r3 /* Save r3 for future use. */
- mr r4, r30 /* Restore r4. */
- li r0, 0
- rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
- clrrdi r4, r4, 3 /* Make r4 aligned to 8. */
- ld r6, 0(r4)
- addi r4, r4, 8
-	cmpdi	cr7, r10, 0	/* Check if it is already aligned.  */
- beq cr7, L(begin1)
-#ifdef __LITTLE_ENDIAN__
- srd r6, r6, r10 /* Discard unwanted bits. */
-#else
- sld r6, r6, r10
-#endif
- ld r9, 0(r4)
- subfic r10, r10, 64
-#ifdef __LITTLE_ENDIAN__
- sld r9, r9, r10 /* Discard unwanted bits. */
-#else
- srd r9, r9, r10
-#endif
- or r6, r6, r9 /* Form complete search str. */
-L(begin1):
- mr r29, r6
- rlwinm r10, r3, 3, 26, 28
- clrrdi r3, r3, 3
- ld r5, 0(r3)
- cmpb r9, r0, r6 /* Check if input has null. */
- cmpdi cr7, r9, 0
- bne cr7, L(return3)
- cmpb r9, r0, r5 /* Check if input has null. */
-#ifdef __LITTLE_ENDIAN__
- srd r9, r9, r10
-#else
- sld r9, r9, r10
-#endif
- cmpdi cr7, r9, 0
- bne cr7, L(retnull)
-
- li r12, -8 /* Shift values. */
- li r11, 72 /* Shift values. */
- cmpdi cr7, r10, 0
- beq cr7, L(nextbyte1)
- mr r12, r10
- addi r12, r12, -8
- subfic r11, r12, 64
-
-L(nextbyte1):
- ldu r7, 8(r3) /* Load next dw. */
- addi r12, r12, 8 /* Shift one byte and compare. */
- addi r11, r11, -8
-#ifdef __LITTLE_ENDIAN__
- srd r9, r5, r12 /* Rotate based on mask. */
- sld r10, r7, r11
-#else
- sld r9, r5, r12
- srd r10, r7, r11
-#endif
- /* Form single dw from few bytes on first load and second load. */
- or r10, r9, r10
- /* Check for null in the formed dw. */
- cmpb r9, r0, r10
- cmpdi cr7, r9, 0
- bne cr7, L(retnull)
- /* Cmpb search str and input str. */
- cmpb r9, r10, r6
- cmpdi cr7, r9, -1
- beq cr7, L(match)
- addi r8, r8, 1
- b L(begin)
-
- .align 4
-L(match):
- /* There is a match of 8 bytes, check next bytes. */
- cmpdi cr7, r31, 8
- beq cr7, L(return)
- /* Update next starting point r8. */
- srdi r9, r11, 3
- subf r9, r9, r3
- mr r8, r9
-
-L(secondmatch):
- mr r5, r7
- rlwinm r10, r30, 3, 26, 28 /* Calculate padding in bits. */
- ld r6, 0(r4)
- addi r4, r4, 8
-	cmpdi	cr7, r10, 0	/* Check if it is already aligned.  */
- beq cr7, L(proceed3)
-#ifdef __LITTLE_ENDIAN__
- srd r6, r6, r10 /* Discard unwanted bits. */
- cmpb r9, r0, r6
- sld r9, r9, r10
-#else
- sld r6, r6, r10
- cmpb r9, r0, r6
- srd r9, r9, r10
-#endif
- cmpdi cr7, r9, 0
- bne cr7, L(proceed3)
- ld r9, 0(r4)
- subfic r10, r10, 64
-#ifdef __LITTLE_ENDIAN__
- sld r9, r9, r10 /* Discard unwanted bits. */
-#else
- srd r9, r9, r10
-#endif
- or r6, r6, r9 /* Form complete search str. */
-
-L(proceed3):
- li r7, 0
- addi r3, r3, 8
- cmpb r9, r0, r5
- cmpdi cr7, r9, 0
- bne cr7, L(proceed4)
- ld r7, 0(r3)
-L(proceed4):
-#ifdef __LITTLE_ENDIAN__
- srd r9, r5, r12
- sld r10, r7, r11
-#else
- sld r9, r5, r12
- srd r10, r7, r11
-#endif
- /* Form single dw with few bytes from first and second load. */
- or r10, r9, r10
- cmpb r9, r0, r6
- cmpdi cr7, r9, 0
- bne cr7, L(return4)
- /* Check for null in the formed dw. */
- cmpb r9, r0, r10
- cmpdi cr7, r9, 0
- bne cr7, L(retnull)
-	/* If the next 8 bytes don't match, start the search again.  */
- cmpb r9, r10, r6
- cmpdi cr7, r9, -1
- bne cr7, L(reset)
- /* If the next 8 bytes match, load and compare next 8. */
- b L(secondmatch)
-
- .align 4
-L(reset):
- /* Start the search again. */
- addi r8, r8, 1
- b L(begin)
-
- .align 4
-L(return3):
- /* Count leading zeros and compare partial dw. */
-#ifdef __LITTLE_ENDIAN__
- addi r7, r9, -1
- andc r7, r7, r9
- popcntd r7, r7
- subfic r7, r7, 64
- sld r10, r5, r7
- sld r6, r6, r7
-#else
- cntlzd r7, r9
- subfic r7, r7, 64
- srd r10, r5, r7
- srd r6, r6, r7
-#endif
- cmpb r9, r10, r6
- cmpdi cr7, r9, -1
- addi r8, r8, 1
- /* Start search again if there is no match. */
- bne cr7, L(begin)
- /* If the words match, update return values. */
- subfic r7, r7, 64
- srdi r7, r7, 3
- add r3, r3, r7
- subf r3, r31, r3
- b L(end)
-
- .align 4
-L(return4):
- /* Count leading zeros and compare partial dw. */
-#ifdef __LITTLE_ENDIAN__
- addi r7, r9, -1
- andc r7, r7, r9
- popcntd r7, r7
- subfic r7, r7, 64
- sld r10, r10, r7
- sld r6, r6, r7
-#else
- cntlzd r7, r9
- subfic r7, r7, 64
- srd r10, r10, r7
- srd r6, r6, r7
-#endif
- cmpb r9, r10, r6
- cmpdi cr7, r9, -1
- addi r8, r8, 1
- bne cr7, L(begin)
- subfic r7, r7, 64
- srdi r11, r11, 3
- subf r3, r11, r3
- srdi r7, r7, 3
- add r3, r3, r7
- subf r3, r31, r3
- b L(end)
-
- .align 4
-L(begin):
- mr r3, r8
-	/* When our iterations exceed ITERATIONS, fall back to the default.  */
- addi r28, r28, 1
- cmpdi cr7, r28, ITERATIONS
- beq cr7, L(default)
- lbz r4, 0(r30)
- bl STRCHR
- nop
- /* If first char of search str is not present. */
- cmpdi cr7, r3, 0
- ble cr7, L(end)
- mr r8, r3
- mr r4, r30 /* Restore r4. */
- li r0, 0
- mr r6, r29
- clrrdi r4, r4, 3
- addi r4, r4, 8
- b L(begin1)
-
- /* Handle less than 8 search string. */
- .align 4
-L(lessthan8):
- mr r4, r3
- mr r9, r30
- li r0, 0
-
- rlwinm r10, r9, 3, 26, 28 /* Calculate padding in bits. */
- srdi r8, r10, 3 /* Padding in bytes. */
-	clrrdi	r9, r9, 3	/* Make r9 aligned to 8.  */
- ld r6, 0(r9)
-	cmpdi	cr7, r10, 0	/* Check if it is already aligned.  */
- beq cr7, L(proceed2)
-#ifdef __LITTLE_ENDIAN__
- srd r6, r6, r10 /* Discard unwanted bits. */
-#else
- sld r6, r6, r10
-#endif
- subfic r8, r8, 8
- cmpd cr7, r8, r31 /* Next load needed? */
- bge cr7, L(proceed2)
- ld r7, 8(r9)
- subfic r10, r10, 64
-#ifdef __LITTLE_ENDIAN__
- sld r7, r7, r10 /* Discard unwanted bits. */
-#else
- srd r7, r7, r10
-#endif
- or r6, r6, r7 /* Form complete search str. */
-L(proceed2):
- mr r29, r6
- rlwinm r10, r3, 3, 26, 28
- clrrdi r7, r3, 3 /* Make r3 aligned. */
- ld r5, 0(r7)
- sldi r8, r31, 3
- subfic r8, r8, 64
-#ifdef __LITTLE_ENDIAN__
- sld r6, r6, r8
- cmpb r9, r0, r5
- srd r9, r9, r10
-#else
- srd r6, r6, r8
- cmpb r9, r0, r5
- sld r9, r9, r10
-#endif
- cmpdi cr7, r9, 0
- bne cr7, L(noload)
- cmpdi cr7, r10, 0
- beq cr7, L(continue)
- ld r7, 8(r7)
-L(continue1):
- mr r12, r10
- addi r12, r12, -8
- subfic r11, r12, 64
- b L(nextbyte)
-
- .align 4
-L(continue):
- ld r7, 8(r7)
- li r12, -8 /* Shift values. */
- li r11, 72 /* Shift values. */
-L(nextbyte):
- addi r12, r12, 8 /* Mask for rotation. */
- addi r11, r11, -8
-#ifdef __LITTLE_ENDIAN__
- srd r9, r5, r12
- sld r10, r7, r11
- or r10, r9, r10
- sld r10, r10, r8
- cmpb r9, r0, r10
- srd r9, r9, r8
-#else
- sld r9, r5, r12
- srd r10, r7, r11
- or r10, r9, r10
- srd r10, r10, r8
- cmpb r9, r0, r10
- sld r9, r9, r8
-#endif
- cmpdi cr7, r9, 0
- bne cr7, L(retnull)
- cmpb r9, r10, r6
- cmpdi cr7, r9, -1
- beq cr7, L(end)
- addi r3, r4, 1
-	/* When our iterations exceed ITERATIONS, fall back to the default.  */
- addi r28, r28, 1
- cmpdi cr7, r28, ITERATIONS
- beq cr7, L(default)
- lbz r4, 0(r30)
- bl STRCHR
- nop
- /* If first char of search str is not present. */
- cmpdi cr7, r3, 0
- ble cr7, L(end)
- mr r4, r3
- mr r6, r29
- li r0, 0
- b L(proceed2)
-
- .align 4
-L(noload):
- /* Reached null in r3, so skip next load. */
- li r7, 0
- b L(continue1)
-
- .align 4
-L(return):
- /* Update return values. */
- srdi r9, r11, 3
- subf r3, r9, r3
- b L(end)
-
- /* Handling byte by byte. */
- .align 4
-L(bytebybyte):
- mr r8, r3
- addi r8, r8, -1
-L(loop1):
- addi r8, r8, 1
- mr r3, r8
- mr r4, r30
- lbz r6, 0(r4)
- cmpdi cr7, r6, 0
- beq cr7, L(updater3)
-L(loop):
- lbz r5, 0(r3)
- cmpdi cr7, r5, 0
- beq cr7, L(retnull)
- cmpld cr7, r6, r5
- bne cr7, L(loop1)
- addi r3, r3, 1
- addi r4, r4, 1
- lbz r6, 0(r4)
- cmpdi cr7, r6, 0
- beq cr7, L(updater3)
- b L(loop)
-
- /* Handling return values. */
- .align 4
-L(updater3):
-	subf	r3, r31, r3	/* Subtract search-str length (r31) from r3.  */
- b L(end)
-
- .align 4
-L(ret_r3):
- mr r3, r29 /* Return r3. */
- b L(end)
-
- .align 4
-L(retnull):
- li r3, 0 /* Return NULL. */
- b L(end)
-
- .align 4
-L(default):
- mr r4, r30
- bl __strstr_ppc
- nop
-
- .align 4
-L(end):
- addi r1, r1, FRAMESIZE /* Restore stack pointer. */
- cfi_adjust_cfa_offset(-FRAMESIZE)
- ld r0, 16(r1) /* Restore the saved link register. */
-	ld	r28, -32(r1)	/* Restore caller's save register r28.  */
-	ld	r29, -24(r1)	/* Restore caller's save register r29.  */
-	ld	r30, -16(r1)	/* Restore caller's save register r30.  */
-	ld	r31, -8(r1)	/* Restore caller's save register r31.  */
-	mtlr	r0		/* Restore the link register.  */
- blr
-END (STRSTR)
-libc_hidden_builtin_def (strstr)
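
Stepping back, the deleted strstr is a candidate-filter loop: measure
the needle with STRLEN, bound the haystack with STRNLEN, find candidate
positions with STRCHR, verify eight bytes at a time with cmpb, and after
ITERATIONS candidates give up and call the generic __strstr_ppc.  A
much-simplified C outline of that control flow (strstr_sketch is
illustrative and uses strncmp in place of the doubleword compare loop):

#include <string.h>

#define ITERATIONS 64

static char *
strstr_sketch (const char *s, const char *pat)
{
  size_t patlen = strlen (pat);
  if (patlen == 0)
    return (char *) s;			/* empty needle: L(ret_r3) */
  if (strnlen (s, patlen) < patlen)
    return NULL;			/* haystack shorter than needle */

  for (int iter = 0; iter < ITERATIONS; iter++)
    {
      s = strchr (s, pat[0]);		/* next candidate position */
      if (s == NULL)
	return NULL;
      if (strncmp (s, pat, patlen) == 0)
	return (char *) s;		/* cmpb dword loop in the asm */
      s++;
    }
  return strstr (s, pat);		/* L(default): fall back */
}
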
diff --git a/sysdeps/powerpc/powerpc64/power7/sub_n.S b/sysdeps/powerpc/powerpc64/power7/sub_n.S
deleted file mode 100644
index 848dad5718..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/sub_n.S
+++ /dev/null
@@ -1,23 +0,0 @@
-/* PowerPC64 mpn_sub_n -- mpn subtraction, sharing the mpn_add_n
-   implementation.
- Copyright (C) 2013-2017 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define USE_AS_SUB
-#include "add_n.S"
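
add_n.S (outside this section) assembles as either mpn_add_n or
mpn_sub_n depending on USE_AS_SUB: both walk two equal-length limb
vectors, one propagating a carry and the other a borrow.  A plain C
picture of the subtraction case, with hypothetical names (GMP-style mpn
routines return the final borrow):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t mp_limb;

static mp_limb
sub_n_sketch (mp_limb *rp, const mp_limb *up, const mp_limb *vp, size_t n)
{
  mp_limb borrow = 0;
  for (size_t i = 0; i < n; i++)
    {
      mp_limb u = up[i], v = vp[i];
      rp[i] = u - v - borrow;
      /* Borrow out if u < v, or if u == v while a borrow came in.  */
      borrow = (u < v) | ((u == v) & borrow);
    }
  return borrow;
}
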